# Libraries
library(here)
library(vroom)
library(knitr)
library(kableExtra)
library(lubridate)
library(viridis)
library(janitor)
library(sciplot)
library(meta)
library(broom)
library(MASS)
library(ez)
library(car)
library(pastecs)
library(qqman)
library(ggrepel)
library(fs)
library(grid)
library(tiff)
library(lm.beta)
library(rsq)
library(tidyverse)
# Default chunk options for the whole document: 8.5-wide figures, warnings and
# messages hidden, and "-l" passed to the bash engine.
knitr::opts_chunk$set(
  fig.width = 8.5,
  warning = FALSE,
  message = FALSE,
  engine.opts = list(bash = "-l")
)
# Use the black-and-white ggplot theme for every plot
theme_set(theme_bw())
GWAS Code
Murray first ran a genome-wide association study in the UK Biobank European cohort with gout as the outcome. This analysis included a total of 27,287,012 variants (after imputation) and was adjusted for age, sex, and the first 40 principal components. The code for this is shown below:
# Save the following as do_gout_gwas.sh
# Usage: bash do_gout_gwas.sh <prefix>
#   <prefix> is the name of a VCF chunk without its .vcf.gz extension.
# Reads gout_gwas_keep_ids_w_sex.txt and gout_gwas_covar.covar from the current
# directory and writes gzipped, tab-separated reports to gout_gwas/.
file=$1
mem=36000

# Convert the VCF chunk to PLINK binary format
plink1.9b6.10 --vcf "${file}.vcf.gz" --make-bed --out "${file}" --memory "$mem"

# Keep the analysis samples and attach the gout phenotype and sex
plink1.9b6.10 --bfile "${file}" --out "${file}_tmp" --keep gout_gwas_keep_ids_w_sex.txt --pheno gout_gwas_covar.covar --pheno-name plink_goutaff --update-sex gout_gwas_keep_ids_w_sex.txt --make-bed --memory "$mem"

# Logistic regression with sex, Age and pc1-pc40 as covariates, plus
# frequency, missingness and Hardy-Weinberg reports
# --geno 0.1 is default
plink1.9b6.10 --bfile "${file}_tmp" --logistic sex --freq case-control --geno 0.1 --missing --ci 0.95 --maf 0.0001 --hwe 0.000001 --hardy --out "gout_gwas/${file}" --covar gout_gwas_covar.covar --covar-name Age,pc1-pc40 --memory "$mem"

# The intermediate binary filesets are no longer needed
rm "${file}".{bed,bim,fam} "${file}_tmp".{bed,bim,fam}

# Convert each space-padded PLINK report to a gzipped TSV: squeeze runs of
# spaces, turn them into tabs, strip leading/trailing tabs, then delete the
# original report only if the conversion succeeded.
for ext in assoc.logistic frq.cc hwe imiss lmiss; do
  report="gout_gwas/${file}.${ext}"
  tr -s ' ' < "$report" | tr ' ' '\t' | sed 's/^\t//g' | sed 's/\t$//g' | gzip -c > "${report}.tsv.gz" && rm "$report"
done
# Then run this code
# Runs do_gout_gwas.sh once per file matching bgenix_convert_chr*.gz in the
# splits directory, passing the file name with its .vcf.gz suffix removed.
# The glob is handed straight to basename instead of going through `ls`, so
# file names are not re-split on whitespace.
# NOTE(review): -P is given parallel_process.txt, presumably a file holding
# the number of concurrent jobs - confirm.
cd /Volumes/archive/merrimanlab_nobackup/ukbio/EGAD00010001474/splits
parallel --progress -P parallel_process.txt --tmp ./tmp/ 'bash do_gout_gwas.sh {}' ::: $(basename -s .vcf.gz -a bgenix_convert_chr*.gz)
# These were then concatenated together into ukbb_gout-allcontrol_chr1-22.X.XY.add_unfiltered_p.tsv which was copied to the Data directory
Next, I filtered the summary statistics to only include SNPs that were genotyped on the CoreExome chip. Additionally, I removed any indels, variants with MAF < 0.01, and X/Y chromosome SNPs, and ensured that all variants were biallelic.
# Build the list of CoreExome samples (European and Oceanian ancestry) whose
# genotypes define which variants count as "genotyped on the CoreExome chip".
# The list was written to disk once and fed to PLINK; both of those steps are
# kept below as comments, so running this block only loads the inputs,
# derives the list and frees everything again.
CoreExPheno <- read_delim(here("Data/Phenotypes/CZ-MB1.2-QC1.10_MergedPhenotypes_20082020.txt"), delim = "\t")
# PLINK .fam file of the QC'd genotype data; X2 is matched against Geno.SampleID below
All_CoreEx_ID <- read_delim("/Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.fam", delim = " ", col_names = FALSE)

# European samples: genotyped, usable, and not from the excluded studies
CoreExPheno_Euro <- CoreExPheno %>%
  filter(Geno.BroadAncestry == "European",
         Geno.SampleID %in% All_CoreEx_ID$X2,
         General.Use != "No",
         !(Pheno.Study %in% c("Auckland Controls", "Australian Controls", "ESR", "Rheumatoid Arthritis")))

# Oceanian samples: same rules with a different study exclusion list, and a
# known gout status is required
CoreExPheno_Poly <- CoreExPheno %>%
  filter(Geno.BroadAncestry == "Oceanian",
         Geno.SampleID %in% All_CoreEx_ID$X2,
         General.Use != "No",
         !(Pheno.Study %in% c("ESR", "Pacific Trust")),
         !is.na(Pheno.GoutSummary))

# Family ID / sample ID pairs in the two-column layout used by PLINK --keep
all_coreex_ids <- rbind(CoreExPheno_Euro, CoreExPheno_Poly) %>%
  select(Geno.FamilyID, Geno.SampleID)
# write_delim(all_coreex_ids, delim = "\t", file = here("Output/Temp/all_coreex_ids.txt"), col_names = F)
rm(CoreExPheno, CoreExPheno_Euro, CoreExPheno_Poly, all_coreex_ids, All_CoreEx_ID)
# system(paste0("source ~/.bashrc; plink1.9b4.9 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --keep ", here("Output/Temp/all_coreex_ids.txt"), " --geno 0.05 --make-bed --out ", here("Output/Temp/inCoreExGeno")))
# Variants present in the filtered CoreExome genotype data, keyed by "<chr>_<bp>"
geno <- read_delim(
  here("Output/Temp", "inCoreExGeno.bim"),
  delim = "\t",
  col_names = FALSE
)
geno <- mutate(geno, CHR_BP = paste0(X1, "_", X4))

# Full UK Biobank gout GWAS summary statistics
sumstat <- vroom(
  here("Data/GWAS/ukbb_gout-allcontrol_chr1-22.X.XY.add_unfiltered_p.tsv"),
  delim = "\t",
  col_names = TRUE
)

# Keep autosomal variants at CoreExome positions whose tested allele is a
# single base, then split the comma-separated SNP field into its parts
sumstat2 <- sumstat %>%
  mutate(CHR_BP = paste0(CHR, "_", BP)) %>%
  filter(CHR %in% 1:22) %>%
  filter(CHR_BP %in% geno$CHR_BP) %>%
  filter(str_length(A1) == 1) %>%
  separate(SNP, into = c("SNP1", "SNP2", "SNP3"), sep = ",", remove = FALSE) # total of 307,368 variants
# table(sumstat2$SNP3) # just chromosomes, can remove
sumstat2_2 <- select(sumstat2, -SNP3)
# Variants whose first identifier (SNP1) is an rsID -------------------------
# The intermediate test1_* tables exist only to document the counts quoted in
# the comments.
test1 <- filter(sumstat2_2, str_detect(SNP1, regex("^rs[0-9]+"))) # 307,066 variants have an rsID in SNP1 column
test1_1 <- filter(test1, str_detect(SNP2, regex("^rs[0-9]+"))) # 74,955 of these have an rsID in SNP2 column
test1_1_1 <- filter(test1_1, SNP1 != SNP2) # 63 variants have two rsIDs, for the most part SNP1 appears to be the newest rsID, but I will keep the extra rsID in a separate column
test1_2 <- filter(test1, str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 231,166 variants have rsID in SNP1 column and chr:bp_a1_a2 in SNP2 column

# Pull chromosome, position and both alleles out of the chr:bp_a1_a2 identifier
test1_2_1 <- mutate(
  test1_2,
  CHR2 = as.numeric(str_split(SNP2, ":", simplify = TRUE)[, 1]),
  BP2 = as.numeric(str_split(str_split(SNP2, ":", simplify = TRUE)[, 2], "_", simplify = TRUE)[, 1]),
  Allele1 = str_split(SNP2, "_", simplify = TRUE)[, 2],
  Allele2 = str_split(SNP2, "_", simplify = TRUE)[, 3]
)
# sum(test1_2_1$BP != test1_2_1$BP2) # all BPs are equal
# sum(test1_2_1$CHR != test1_2_1$CHR2) # all CHRs are equal
# sum(test1_2_1$A1 != test1_2_1$Allele2) # Allele2 is not always A1

# Both parsed alleles must be single bases
test1_2_2 <- test1_2_1 %>%
  select(-CHR2, -BP2) %>%
  filter(str_length(Allele1) == 1 & str_length(Allele2) == 1) # removes a further 13 indels = 231,153 variants
test1_2_final <- select(test1_2_2, SNP, Allele1, Allele2)

test1_3 <- filter(test1, !str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$|^rs[0-9]+$"))) # 945 variants have neither rsID nor chr:bp_a1_a2 in SNP2 column
sum(!str_detect(test1_3$SNP2, regex("^Affx-[0-9]+$"))) # All of these are in the format Affx-<number>

# Sort the two identifiers into typed ID columns and attach the parsed alleles
test1_final <- test1 %>%
  mutate(
    RSID = SNP1,
    ALT_RSID = if_else(str_detect(SNP2, regex("^rs[0-9]+$")) & SNP1 != SNP2, SNP2, NA_character_),
    AFFYID = if_else(str_detect(SNP2, regex("^Affx-[0-9]+$")), SNP2, NA_character_),
    SNP_ID = if_else(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")), SNP2, NA_character_)
  ) %>%
  left_join(test1_2_final, by = "SNP")

# A variant with a SNP_ID but no parsed alleles was dropped above as an indel
test1_final2 <- filter(test1_final, is.na(SNP_ID) | !is.na(Allele1)) # removing the indels
# Variants whose first identifier (SNP1) is NOT an rsID ---------------------
# As for test1, the intermediate test2_* tables only document the counts in
# the comments; test2_1_1 and test2_1_2 are deliberately reused further down.
test2 <- filter(sumstat2_2, !str_detect(SNP1, regex("^rs[0-9]+"))) # 302 variants don't have an rsID in SNP1 column
test2_1 <- filter(test2, str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 300 have the SNP_ID format in SNP1

# Pull chromosome, position and both alleles out of the chr:bp_a1_a2 identifier
test2_1_1 <- mutate(
  test2_1,
  CHR2 = as.numeric(str_split(SNP1, ":", simplify = TRUE)[, 1]),
  BP2 = as.numeric(str_split(str_split(SNP1, ":", simplify = TRUE)[, 2], "_", simplify = TRUE)[, 1]),
  Allele1 = str_split(SNP1, "_", simplify = TRUE)[, 2],
  Allele2 = str_split(SNP1, "_", simplify = TRUE)[, 3]
)
# sum(test2_1_1$BP != test2_1_1$BP2) # all BPs are equal
# sum(test2_1_1$CHR != test2_1_1$CHR2) # all CHRs are equal
# sum(test2_1_1$A1 != test2_1_1$Allele2) # Allele2 is not always A1

# Both parsed alleles must be single bases
test2_1_2 <- test2_1_1 %>%
  select(-CHR2, -BP2) %>%
  filter(str_length(Allele1) == 1 & str_length(Allele2) == 1)
# nrow(test2_1_2) - nrow(test2_1_1) # removed 12 indels
test2_1_final <- select(test2_1_2, SNP, Allele1, Allele2) # 288 total

# What the second identifier looks like for these variants
test2_1_1 <- filter(test2_1, str_detect(SNP2, regex("^rs[0-9]+"))) # 7 of the 300 have RSID in SNP2
test2_1_2 <- filter(test2_1, str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 292 have SNP_ID in SNP2
# sum(test2_1_2$SNP1 != test2_1_2$SNP2) # All SNP_ID columns identical in these individuals
test2_1_3 <- filter(test2_1, str_detect(SNP2, regex("^Affx-[0-9]+$"))) # 1 has AffyID in SNP2
test2_2 <- filter(test2, str_detect(SNP1, regex("^Affx-[0-9]+$"))) # 2 have an AffyID in SNP1
test2_2_1 <- filter(test2_2, str_detect(SNP2, regex("^Affx-[0-9]+$"))) # Both have AffyID in SNP2
# sum(test2_2_1$SNP1 != test2_2_1$SNP2) # All AffyID columns identical in these individuals

# Sort the two identifiers into typed ID columns and attach the parsed alleles
test2_final <- test2 %>%
  mutate(
    RSID = if_else(str_detect(SNP2, regex("^rs[0-9]+$")), SNP2, NA_character_),
    ALT_RSID = NA_character_,
    AFFYID = if_else(str_detect(SNP2, regex("^Affx-[0-9]+$")), SNP2, NA_character_),
    SNP_ID = if_else(str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")), SNP1, NA_character_)
  ) %>%
  left_join(test2_1_final, by = "SNP")

# A variant with a SNP_ID but no parsed alleles was dropped above as an indel
test2_final2 <- filter(test2_final, is.na(SNP_ID) | !is.na(Allele1)) # removing the indels
# Recombine the two ID-cleaned subsets into one table ordered by position
sumstat3 <- rbind(test1_final2, test2_final2) %>% arrange(CHR, BP) # 307,343 variants

# Drop every intermediate object whose name contains "test"
rm(list = ls(pattern = "test"))
# Also drop sumstat2_2 and any leftover tmp; only names that actually exist
# are passed on, so rm() does not warn when tmp has not been created yet.
rm(list = intersect(c("sumstat2_2", "tmp"), ls()))
# Unique chromosome/position pairs remaining in the cleaned summary statistics
tmp <- distinct(sumstat3, CHR, BP)
# One position list per autosome, used as the pattern file for the grep below
for (i in 1:22) {
  tmp %>%
    filter(CHR == i) %>%
    select(BP) %>%
    write_delim(file = paste0(here("Output/Temp/"), "chr", i, "_snplist.txt"), delim = "\n")
}
# Now to pull out the MAF and alleles for all SNPs using the mfi files
#system(paste0('source ~/.bashrc; parallel "grep -Fwhf ', here("Output/Temp/"), 'chr{}_snplist.txt /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/splits/ukb_mfi_chr{}_v3.txt > ', here("Output/Temp/"), 'ukb_maf_info_chr{}.txt" ::: {1..22}'))
# Read the per-chromosome MAF/INFO extracts (ukb_maf_info_chr<N>.txt), tag
# each with its chromosome and stack them in one call. This replaces the
# assign()/get() round trip and the table that was regrown on every
# iteration, and leaves no chr<N>_snps or loop-index objects behind.
out <- lapply(1:22, function(chrom) {
  read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", chrom, ".txt")), delim = "\t", col_names = FALSE) %>%
    mutate(CHR = chrom)
})
out <- do.call(rbind, out)
colnames(out) <- c("SNP1_mfi", "SNP2_mfi", "BP_mfi", "Allele1_mfi", "Allele2_mfi", "MAF_mfi", "Minor_Allele_mfi", "INFO_mfi", "CHR_mfi")
# sum(is.na(out$Allele1_mfi) | is.na(out$Allele2_mfi)) # all have allele1 and allele2

# Keep single-base alleles with MAF strictly between 0.01 and 0.99 and
# INFO above 0.3
out <- out %>%
  mutate(CHR_BP = paste0(CHR_mfi, "_", BP_mfi)) %>%
  filter(Allele1_mfi %in% c("A", "C", "G", "T"),
         Allele2_mfi %in% c("A", "C", "G", "T"),
         MAF_mfi > 0.01,
         MAF_mfi < 0.99,
         INFO_mfi > 0.3) # 246,869 variants
sum(duplicated(out$CHR_BP)) # 224 multi-allelic sites
# Attach the mfi annotation and drop variants without a surviving mfi record
sumstat4 <- left_join(sumstat3, out, by = "CHR_BP")
sumstat4 <- filter(sumstat4, !is.na(Allele1_mfi))
sumstat4 <- select(sumstat4, -CHR_mfi, -BP_mfi) # 246,017

# CoreExome variants at the same positions with single-base alleles
geno2 <- geno %>%
  filter(CHR_BP %in% sumstat4$CHR_BP) %>%
  filter(X5 %in% c("A", "C", "G", "T") & X6 %in% c("A", "C", "G", "T")) # only 244,655 variants
# length(unique(sumstat4$CHR_BP)) # same number as above, difference is multi-allelic sites

# Positions that occur more than once in sumstat4
tmp <- unique(sumstat4$CHR_BP[duplicated(sumstat4$CHR_BP)]) # 823 duplicated sites
tmp2 <- filter(sumstat4, !(CHR_BP %in% tmp)) # 244,015 biallelic sites
tmp3 <- filter(sumstat4, CHR_BP %in% tmp) # 2,002 total variants, need to go off the one that is in geno2

# Bring in the CoreExome alleles (X5/X6) for the duplicated positions
tmp4 <- tmp3 %>%
  left_join(geno2, by = "CHR_BP") %>%
  select(-X1, -X3, -X4) %>%
  unique()
# test <- tmp4 %>%
# filter(Allele1 != Allele1_mfi | is.na(Allele1)) # Allele1_mfi can be changed to Allele1

# Resolve duplicated positions: the parsed Allele2 (when there is one) must
# agree with the mfi record ...
tmp5 <- filter(tmp4, is.na(Allele2) | Allele2 == Allele2_mfi)
# ... both mfi alleles must be the two CoreExome alleles ...
tmp6 <- tmp5 %>%
  filter(Allele1_mfi == X5 | Allele1_mfi == X6) %>%
  filter(Allele2_mfi == X5 | Allele2_mfi == X6)
# (diagnostics: positions that are still duplicated at this point)
tmp7a <- filter(tmp6, duplicated(CHR_BP))
tmp7b <- filter(tmp6, CHR_BP %in% tmp7a$CHR_BP)
# ... and the tested allele must be one of the CoreExome alleles
tmp7 <- filter(tmp6, A1 == X5 | A1 == X6) # now all remaining variants are at unique locations

# Positions that were never duplicated get the same CoreExome allele check
tmp2 <- sumstat4 %>%
  filter(!(CHR_BP %in% tmp)) %>%
  left_join(geno2, by = "CHR_BP") %>%
  filter(Allele1_mfi == X5 | Allele1_mfi == X6) %>%
  filter(Allele2_mfi == X5 | Allele2_mfi == X6) %>%
  select(-X1, -X3, -X4)

# Stack both sets and take the allele columns from the mfi record
sumstat4_1 <- rbind(tmp2, tmp7) %>%
  arrange(CHR, BP) %>%
  filter(Allele2 == Allele2_mfi | is.na(Allele2)) %>%
  mutate(Allele1 = Allele1_mfi, Allele2 = Allele2_mfi) %>%
  select(-Allele1_mfi, -Allele2_mfi)
# test <- sumstat4_1 %>%
# filter(!(A1 == X5 | A1 == X6))

# Compare the identifiers from the GWAS output with those from the mfi files
sum(sumstat4_1$SNP1 != sumstat4_1$SNP2_mfi) # same
sum(sumstat4_1$SNP2 != sumstat4_1$SNP1_mfi) # 50 different
test <- filter(sumstat4_1, SNP2 != SNP1_mfi) # nothing to worry about, can remove mfi versions
# Tidy the column names and drop columns that are no longer needed
sumstat5 <- sumstat4_1 %>%
  rename(
    Effect_Allele = A1,
    INFO = INFO_mfi,
    MAF = MAF_mfi,
    Minor_Allele = Minor_Allele_mfi
  ) %>%
  select(-TEST, -NMISS, -CHR_BP, -X2, -X5, -X6, -SNP1_mfi, -SNP2_mfi)

# Variants whose tested allele is not the minor allele
test <- filter(sumstat5, Minor_Allele != Effect_Allele)
#summary(test$MAF) # all really close to 0.5 MAF, just need to flip OR, L95, and U95 then set Effect_Allele to Minor_Allele column

# Re-express those variants for the minor allele: invert the odds ratio and
# swap the inverted confidence limits (old upper -> new lower and vice versa)
test <- test %>%
  mutate(
    OR = 1 / OR,
    flipped_upper = 1 / L95,
    flipped_lower = 1 / U95,
    L95 = flipped_lower,
    U95 = flipped_upper,
    Effect_Allele = Minor_Allele
  ) %>%
  rename(EAF = MAF) %>%
  select(-flipped_upper, -flipped_lower, -Minor_Allele)

# Variants already expressed for the minor allele, plus the flipped ones
sumstat5_1 <- sumstat5 %>%
  filter(Minor_Allele == Effect_Allele) %>%
  select(-Minor_Allele) %>%
  rename(EAF = MAF) %>%
  rbind(test) %>%
  arrange(CHR, BP)
# Label the non-effect allele as Alternate_Allele. Depending on which of the
# two allele columns holds the effect allele, the other one is the alternate.
# Variants whose effect allele matches neither column fall out here.
test <- sumstat5_1 %>%
  filter(Allele2 == Effect_Allele) %>%
  rename(Alternate_Allele = Allele1) %>%
  select(CHR, SNP, BP, Effect_Allele, Alternate_Allele, OR:SNP_ID, EAF:INFO)
test2 <- sumstat5_1 %>%
  filter(Allele1 == Effect_Allele) %>%
  rename(Alternate_Allele = Allele2) %>%
  select(CHR, SNP, BP, Effect_Allele, Alternate_Allele, OR:SNP_ID, EAF:INFO)
sumstat_final <- rbind(test, test2) %>%
  arrange(CHR, BP)
# save(sumstat_final, file = here("Output/sumstat_final.RData"))

# Free the intermediates. The list includes names that the steps above never
# create (sumstat4_final, sumstat5_2), so only objects that actually exist
# are handed to rm(), which avoids "object not found" warnings.
rm(list = intersect(
  c("geno", "geno2", "out", "sumstat", "sumstat2", "sumstat3", "sumstat4",
    "sumstat4_1", "sumstat4_final", "sumstat5", "sumstat5_1", "sumstat5_2",
    "test", "test2", "tmp2", "tmp3", "tmp4", "tmp5", "tmp6", "tmp7", "tmp7a",
    "tmp7b", "tmp"),
  ls()
))
To get from these cleaned up summary statistics to the final list of SNPs for the PRS, I did the following:
I filtered out all SNPs with P-values greater than 5e-8.
I took each lead SNP within a 1 Mb window and used these to define 15 crude loci.
This list of lead SNPs was further filtered to only include one lead SNP per full locus.
- The boundaries of these “full loci” were defined based on two consecutive genome-wide significant SNPs being more than 500 kb apart.
Next, SNPs in the UK Biobank BGEN files were extracted if they fit within the boundaries of these “full loci”.
Conditional GWAS were run at each locus, conditioning on the lead SNP.
If there was a significant SNP (P < 5e-8) remaining after conditioning, the original lead SNP and the new lead SNP were used for a subsequent conditional GWAS at this locus.
This was repeated until no more significant SNPs (P < 5e-8) remained at each locus.
Locus zooms were plotted for each locus, using both the unconditioned and conditioned GWAS results.
Finally, the resulting list of 19 lead SNPs was saved in a single file ready for conversion to a PRS.
# Defining one SNP per locus ---------------------------------------------------------------------------
# First, keep only the genome-wide significant SNPs (P <= 5e-8) and arrange by P
load(here("Output/sumstat_final.RData"))
sumstat_signif <- sumstat_final %>%
  filter(P <= 5e-8) %>%
  arrange(P)

# Grouping into loci +- 500 kb of top SNPs
# Greedy selection: repeatedly take the most significant remaining SNP as a
# lead SNP, then discard every SNP on the same chromosome within 500 kb of it.
# between() tests the window directly instead of materialising a
# one-million-element position vector for %in% on every pass.
gout_top <- sumstat_signif %>%
  slice(0)
gout2 <- sumstat_signif
while (nrow(gout2) > 0) {
  tmp <- gout2 %>%
    slice(1)
  gout_top <- rbind(gout_top, tmp)
  gout2 <- gout2 %>%
    filter(!(CHR == tmp$CHR & between(BP, tmp$BP - 500000, tmp$BP + 500000)))
}
gout_top <- gout_top %>%
  arrange(CHR, BP)
# Finding regions of loci
# A "full locus" is a run of significant SNPs on one chromosome in which each
# SNP lies less than 500 kb from the previous one.
sumstat_signif <- sumstat_signif %>%
  arrange(CHR, BP)
if (nrow(sumstat_signif) == 0) {
  stop("No genome-wide significant SNPs found; cannot define loci.", call. = FALSE)
}

# TRUE where a SNP continues the locus of the SNP before it (same chromosome
# and gap below 500 kb). Compares rows 2..n with rows 1..n-1, so it is empty
# when there is only one significant SNP.
same_locus <- sumstat_signif$CHR[-1] == head(sumstat_signif$CHR, -1) &
  diff(sumstat_signif$BP) < 500000
# Row numbers at which a new locus begins (other than row 1)
breaks <- which(!same_locus) + 1
locus_start <- c(1, breaks)
locus_end <- c(breaks - 1, nrow(sumstat_signif))

# Extracting regions
# One row per locus, padded by 50 kb on each side. BGEN is a
# "<two-digit chr>:<start>-<end>" string; positions are formatted with
# sprintf() so that round values (e.g. 150000000) are never written in
# scientific notation, which paste0() alone would do.
bgen_ranges <- tibble(
  CHR = sumstat_signif$CHR[locus_start],
  BP1 = sumstat_signif$BP[locus_start] - 50000,
  BP2 = sumstat_signif$BP[locus_end] + 50000
) %>%
  mutate(BGEN = paste0(if_else(CHR < 10, "0", ""), CHR, ":",
                       sprintf("%.0f", BP1), "-", sprintf("%.0f", BP2))) %>%
  arrange(CHR, BP1)
tmp <- bgen_ranges %>%
  select(BGEN)

# Clean up this step's helpers, plus leftovers from the lead-SNP selection
# if they are still around
rm(same_locus, breaks, locus_start, locus_end)
rm(list = intersect(c("gout2", "out", "i"), ls()))
#write_delim(tmp, file = here("Output/Temp", "bgen_range.txt"), delim = "\n", col_names = F)
# Extracting all SNPs from biallelic sumstat that fit within boundaries of loci
# Each locus is collected with lapply() and stacked once, rather than growing
# a table with rbind() inside a loop; seq_len() keeps the iteration empty if
# there are no loci.
locus_rows <- seq_len(nrow(bgen_ranges))
loci <- lapply(locus_rows, function(k) {
  sumstat_final %>%
    filter(CHR == bgen_ranges$CHR[k] & between(BP, bgen_ranges$BP1[k], bgen_ranges$BP2[k]))
}) %>%
  bind_rows() %>%
  mutate(SNP_ID2 = paste0(CHR, "_", BP, "_", Alternate_Allele, "_", Effect_Allele))

# CHR / BP / BP / SNP rows, the layout consumed by the (commented-out)
# `--extract range` call further down
tmp <- loci %>%
  mutate(BP2 = BP) %>%
  select(CHR, BP, BP2, SNP)
#write_delim(tmp, file = here("Output/Temp", "loci_snps.txt"), delim = "\t", col_names = F)

# Keep one lead SNP per full locus: the most significant of the crude lead
# SNPs that fall inside the locus boundaries
lead_per_locus <- lapply(locus_rows, function(k) {
  gout_top %>%
    filter(CHR == bgen_ranges$CHR[k] & between(BP, bgen_ranges$BP1[k], bgen_ranges$BP2[k])) %>%
    arrange(P) %>%
    slice(1)
}) %>%
  bind_rows()
# Every locus must contribute exactly one lead SNP for the column bind below
stopifnot(nrow(lead_per_locus) == nrow(bgen_ranges))
gout_top <- lead_per_locus %>%
  cbind(bgen_ranges %>% select(-CHR))
rm(tmp, bgen_ranges, locus_rows, lead_per_locus)
rm(list = intersect(c("out", "i"), ls()))
# Extracting all SNPs at loci from bgen files and converting to plink format -------------------------------
# system(paste0('source ~/.bashrc; parallel "bgenix -g /Volumes/archive/merrimanlab_nobackup/ukbio/EGAD00010001474/ukb_imp_chr{}_v3.bgen -vcf -incl-range ', here("Output/Temp", "bgen_range.txt"), ' | bcftools reheader -h /Volumes/archive/merrimanlab_nobackup/ukbio/EGAD00010001474/bgen_to_vcf/new_header.txt | bcftools annotate --rename-chrs /Volumes/archive/merrimanlab_nobackup/ukbio/EGAD00010001474/bgen_to_vcf/rename_contigs.txt | bgzip -c > ', here("Output/Temp", "chr"), '{}_forclumping.vcf.gz" ::: ', paste(unique(gout_top$CHR), collapse = " ")))
# system(paste0('source ~/.bashrc; parallel "plink1.9b4.9 --vcf ', here("Output/Temp/"), 'chr{}_forclumping.vcf.gz --extract range ', here("Output/Temp/loci_snps.txt"), ' --pheno ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --pheno-name plink_goutaff --update-sex ', here("Data/GWAS", "gout_gwas_keep_ids_w_sex.txt"), ' --geno 0.1 --maf 0.01 --hwe 0.000001 --make-bed --out ', here("Output/Temp/"), 'chr{}_tmp" ::: ', paste(unique(gout_top$CHR), collapse = " ")))
# Reading the bim files into R and converting their identifiers to just the rsid
# Each chr<N>_tmp.bim is overwritten in place with its variant ID replaced by
# the rsID from `loci` (or the chr:bp_a1_a2 SNP_ID when there is no rsID).
# The file pattern is escaped and anchored so only names ending in "_tmp.bim"
# are picked up.
# NOTE(review): the join is on chromosome and position only; if `loci` ever
# holds two rows for one position the rewritten .bim gains rows and no longer
# matches its .bed - confirm positions in `loci` are unique.
file_names <- list.files(here("Output/Temp/"), pattern = "_tmp\\.bim$")
for (i in file_names) {
  bim_path <- paste0(here("Output/Temp/"), i)
  bim <- read_delim(bim_path, delim = "\t", col_names = FALSE) %>%
    left_join(loci, by = c("X1" = "CHR", "X4" = "BP")) %>%
    mutate(SNP_clean = case_when(is.na(RSID) ~ SNP_ID, TRUE ~ RSID)) %>%
    select(X1, SNP_clean, X3:X6)
  write_delim(bim, file = bim_path, delim = "\t", col_names = FALSE)
}
rm(list = intersect(c("i", "file_names", "bim", "bim_path"), ls()))
# Running the conditional GWAS ----------------------------------------
# Split up the plink files to have one locus per file (saves on computational time)
# One "extractrange_<lead rsID>.txt" per locus, holding chromosome, start,
# end and the lead rsID as a single tab-separated row for `--extract range`.
# NOTE(review): a lead SNP without an rsID would produce extractrange_NA.txt -
# confirm every lead SNP has one.
gout_top2 <- gout_top %>%
  select(CHR, BP1, BP2, RSID)
for (i in seq_len(nrow(gout_top2))) {
  tmp <- gout_top2 %>% slice(i)
  write_delim(tmp, file = paste0(here("Output/Temp/"), "extractrange_", tmp$RSID, ".txt"), delim = "\t", col_names = FALSE)
}
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile {1}/Output/Temp/chr{2}_tmp --extract range {1}/Output/Temp/extractrange_{3}.txt --make-bed --out {1}/Output/Temp/{3}" ::: ', paste(rep(here(), nrow(gout_top)), collapse = " "), ' ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "cat {1}/Output/Temp/{2}.bim | cut -f 2 > {1}/Output/Temp/{2}_snps; split -d -n l/10 {1}/Output/Temp/{2}_snps {1}/Output/Temp/{2}_snps_split" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " ")))
# Conditional GWAS, rounds one to three ------------------------------------
# Each round runs PLINK on the ten SNP splits of every locus still carrying
# signal, reads the results back, stores the combined table per locus and
# records the most significant remaining SNP.

# Collect the results of one conditioning round.
#
# lead_ids:      lead rsIDs, one per locus; they name the PLINK output files
#                <lead>_split00 ... <lead>_split09.
# file_suffix:   round suffix in the file name ("" / "_2" / "_3").
# object_suffix: suffix of the per-locus result object created in the calling
#                environment ("_gwas" / "_gwas2" / "_gwas3"); the locus-zoom
#                code later looks these objects up by name.
#
# Returns a data frame with one row per lead ID: the SNP with the smallest P
# among that locus's additive-model (TEST == "ADD") rows and its P. An empty
# lead_ids gives a zero-row data frame with the same two columns.
read_conditional_round <- function(lead_ids, file_suffix, object_suffix) {
  target_env <- parent.frame()
  top_hits <- lapply(lead_ids, function(lead_id) {
    split_paths <- paste0(here("Output/Temp/"), lead_id, "_split0", 0:9, file_suffix, ".assoc.logistic")
    results <- lapply(split_paths, function(path) {
      read.table(path, header = TRUE) %>% filter(TEST == "ADD")
    })
    results <- do.call(rbind, results)
    # The stored table drops rows with missing values ...
    assign(paste0(lead_id, object_suffix), results %>% na.omit(), envir = target_env)
    # ... while the top hit is taken from all rows (arrange() sorts NA last)
    results %>% select(SNP, P) %>% arrange(P) %>% slice(1)
  })
  if (length(top_hits) == 0) {
    return(data.frame(SNP = character(0), P = numeric(0)))
  }
  do.call(rbind, top_hits)
}

# First round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {2} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition {2} --out {1}/Output/Temp/{2}_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
# Reading all the outputs into R
tmp <- read_conditional_round(gout_top$RSID, "", "_gwas") %>%
  rename(new_lead = SNP, new_p = P)
gout_top2 <- gout_top %>%
  cbind(tmp)
# Loci that still have a genome-wide significant SNP after conditioning
gout_top_resid <- gout_top2 %>%
  filter(new_p < 5e-8)

# Second round of conditioning
# The PLINK calls are skipped when no locus has residual signal
if (nrow(gout_top_resid) > 0) {
  # Add the new lead SNPs to the split SNP lists
  system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))
  # Write the condition list (original lead + new lead) to <lead>_2
  system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}'"), ' > {1}/Output/Temp/{2}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))
  system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_2 --out {1}/Output/Temp/{2}_split{3}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
}
tmp <- read_conditional_round(gout_top_resid$RSID, "_2", "_gwas2") %>%
  rename(new_lead2 = SNP, new_p2 = P)
gout_top3 <- gout_top_resid %>%
  cbind(tmp)
gout_top_resid2 <- gout_top3 %>%
  filter(new_p2 < 5e-8)

# Third round of conditioning
if (nrow(gout_top_resid2) > 0) {
  # Add the second-round lead SNPs to the split SNP lists
  system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))
  # Write the condition list (original lead + both new leads) to <lead>_3
  system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}'"), ' > {1}/Output/Temp/{2}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))
  system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_3 --out {1}/Output/Temp/{2}_split{3}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
}
tmp <- read_conditional_round(gout_top_resid2$RSID, "_3", "_gwas3") %>%
  rename(new_lead3 = SNP, new_p3 = P)
gout_top4 <- gout_top_resid2 %>%
  cbind(tmp)
gout_top_resid3 <- gout_top4 %>%
  filter(new_p3 < 5e-8)

# Remove scratch objects, including any left behind by earlier steps
rm(list = intersect(c("i", "j", "tmp", "tmp2", "tmp3", "file_names"), ls()))
# Locus zooms ---------------------------------------
# Loading in code and gene list
# The sourced script is expected to provide locus.zoom(), which the plotting
# loops call; its definition is not part of this file.
source(here("Script/Functions/locus_zoom.R"))
# Gene table passed to locus.zoom() as `genes.data`. The object name
# deliberately ends in ".txt", mirroring the file it was read from.
UCSC_GRCh37_Genes_UniqueList.txt <- as.data.frame(read_delim(here("Data/GWAS/UCSC_GRCh37_Genes_UniqueList.txt"), delim = "\t"))
# Plotting locus zooms of original GWAS
# Calculating LD
# One PLINK run per lead SNP (chromosome and rsID are paired by --xapply),
# writing Output/Temp/chr<CHR>_<RSID>_ld.ld
system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))
# Reading the LD reports back into R
# Each report is stored as an object named chr<CHR>_<RSID>_ld, which the
# plotting loop retrieves again with get()
for(i in 1:nrow(gout_top)){
assign(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld.ld")))
}
# Making full list of SNPs for labelling
# Original lead SNPs plus the new leads found in conditioning rounds one and
# two, each annotated with its row from `loci` and ordered by position.
# NOTE(review): the join is on RSID, so a new lead that PLINK reported under a
# chr:bp_a1_a2 identifier would come back without annotation - confirm all
# leads are rsIDs.
first_round <- select(gout_top_resid, RSID = new_lead)
second_round <- select(gout_top_resid2, RSID = new_lead2)
gout_top_full <- rbind(select(gout_top, RSID), first_round, second_round)
gout_top_full <- left_join(gout_top_full, loci, by = "RSID")
gout_top_full <- arrange(gout_top_full, CHR, BP)
# Plotting the locus zooms
# One plot per locus from the unconditioned summary statistics: `data` is the
# locus's rows of `loci` with SNP set to the rsID (rows without an rsID are
# dropped), `ld.file` is the LD report read in for the lead SNP, and every
# lead SNP in gout_top_full is passed as a secondary SNP with labels on.
# Plots are written as .jpg files under Output/Plots/.
for(i in 1:nrow(gout_top)){
locus.zoom(data = loci %>% mutate(SNP = RSID) %>% filter(!is.na(SNP), CHR == gout_top$CHR[i] & between(BP, gout_top$BP1[i], gout_top$BP2[i])),
region = c(gout_top$CHR[i], gout_top$BP1[i], gout_top$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Unconditioned ", gout_top$RSID[i], " Locus Zoom"),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top$CHR[i], "_", gout_top$BP1[i], "_", gout_top$BP2[i], "_", gout_top$RSID[i], "_unconditioned", ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of first round of conditioning
# Remaking LD based on ALL new lead SNPs
# One PLINK run per locus, with LD computed relative to the top SNP left
# after conditioning on the original lead (gout_top2$new_lead)
system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top2$CHR, collapse = " "), ' ::: ', paste(gout_top2$new_lead, collapse = " ")))
# Reading the LD reports back into R
# Stored as chr<CHR>_<new lead>_ld so the plotting loop can get() them
for (i in seq_len(nrow(gout_top2))) {
  assign(
    paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld"),
    read_table(paste0(here("Output/Temp/"), "chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld.ld"))
  )
}
# Plotting locus zooms
# `data` is the first-round conditional result table (<RSID>_gwas). The
# region is taken from gout_top2 throughout; the chromosome previously came
# from gout_top, which holds the same rows in the same order.
for (i in seq_len(nrow(gout_top2))) {
  locus.zoom(
    data = get(paste0(gout_top2$RSID[i], "_gwas")),
    region = c(gout_top2$CHR[i], gout_top2$BP1[i], gout_top2$BP2[i]),
    offset_bp = 0,
    ld.file = get(paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld")),
    genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
    plot.title = paste0("Conditioned on ", gout_top2$RSID[i]),
    file.name = paste0(here("Output/Plots/"), "Chr", gout_top2$CHR[i], "_", gout_top2$BP1[i], "_", gout_top2$BP2[i], "_", gout_top2$RSID[i], "_condition_", gout_top2$RSID[i], ".jpg"),
    secondary.snp = gout_top_full$RSID,
    secondary.label = TRUE
  )
}
# Plotting locus zooms of second round of conditioning
# Remaking LD based on new lead SNPs
# One PLINK run per locus that went through the second round, with LD
# computed relative to that round's top SNP (gout_top3$new_lead2)
system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top3$CHR, collapse = " "), ' ::: ', paste(gout_top3$new_lead2, collapse = " ")))
# Reading the LD reports back into R
# Stored as chr<CHR>_<new_lead2>_ld so the plotting loop can get() them
for(i in 1:nrow(gout_top3)){
assign(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld.ld")))
}
# Plotting locus zooms
# `data` is the second-round conditional result table (<RSID>_gwas2); the
# title and file name list both SNPs that were conditioned on
for(i in 1:nrow(gout_top3)){
locus.zoom(data = get(paste0(gout_top3$RSID[i], "_gwas2")),
region = c(gout_top3$CHR[i], gout_top3$BP1[i], gout_top3$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top3$RSID[i], " and ", gout_top3$new_lead[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top3$CHR[i], "_", gout_top3$BP1[i], "_", gout_top3$BP2[i], "_", gout_top3$RSID[i], "_condition_", gout_top3$RSID[i], "and", gout_top3$new_lead[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of third round of conditioning
# Remaking LD based on new lead SNP
# One PLINK run per locus that went through the third round, with LD
# computed relative to that round's top SNP (gout_top4$new_lead3)
system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top4$CHR, collapse = " "), ' ::: ', paste(gout_top4$new_lead3, collapse = " ")))
# Reading the LD reports back into R
# Stored as chr<CHR>_<new_lead3>_ld so the plotting loop can get() them
for(i in 1:nrow(gout_top4)){
assign(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld.ld")))
}
# Plotting locus zooms
# `data` is the third-round conditional result table (<RSID>_gwas3); the
# title and file name list all three SNPs that were conditioned on
for(i in 1:nrow(gout_top4)){
locus.zoom(data = get(paste0(gout_top4$RSID[i], "_gwas3")),
region = c(gout_top4$CHR[i], gout_top4$BP1[i], gout_top4$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top4$RSID[i], " and ", gout_top4$new_lead[i], " and ", gout_top4$new_lead2[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top4$CHR[i], "_", gout_top4$BP1[i], "_", gout_top4$BP2[i], "_", gout_top4$RSID[i], "_condition_", gout_top4$RSID[i], "and", gout_top4$new_lead[i], "and", gout_top4$new_lead2[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Combining all GWAS results together into final list ----------------------------------------------
# Tag every lead SNP with the boundaries (BP1, BP2) of the locus it falls in.
# Each locus is handled by lapply() and the pieces are stacked once;
# seq_len() keeps the iteration empty when there are no regions.
# NOTE(review): a SNP that lies in no region, or whose CHR/BP is missing, is
# silently dropped here - confirm that is intended.
regions <- gout_top %>%
  select(CHR, BP1, BP2)
out <- lapply(seq_len(nrow(regions)), function(k) {
  gout_top_full %>%
    filter(CHR == regions$CHR[k] & between(BP, regions$BP1[k], regions$BP2[k])) %>%
    mutate(BP1 = regions$BP1[k],
           BP2 = regions$BP2[k])
})
out <- do.call(rbind, out)
gout_top_full <- out
# For each of the loci with multiple SNPs, test the association of each SNP after adjusting for
# all other SNPs at that locus.
# First pull out all SNPs from those loci, i.e. rows whose BP1 occurs more than once in gout_top_full.
multi_snps <- gout_top_full %>%
filter(BP1 %in% names(table(gout_top_full$BP1)[table(gout_top_full$BP1) > 1]))
# Write the RSIDs (one per line, no header) to a file that PLINK reads via --extract
tmp <- multi_snps %>% select(RSID)
write_delim(tmp, file = here("Output/Temp/snps_to_extract.txt"), delim = "\n", col_names = F)
# For every chromosome present in multi_snps, subset chr<CHR>_tmp to the listed SNPs (output: chr<CHR>_test)
system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --extract ', here("Output/Temp/snps_to_extract.txt"), ' --make-bed --out ', here("Output/Temp/"), 'chr{1}_test" ::: ', paste(unique(multi_snps$CHR), collapse = " ")))
# Merge the per-chromosome subsets into a single fileset (merged_test).
# NOTE(review): chromosomes 4 and 11 are hard-coded here rather than taken from multi_snps$CHR -
# confirm these are the only chromosomes carrying multi-SNP loci.
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr4_test --bmerge ', here("Output/Temp/"), 'chr11_test --make-bed --out ', here("Output/Temp/"), 'merged_test'))
# Rows 4-7 of multi_snps: rerun the logistic model conditioning on that one SNP (--condition);
# results are written to final_gwas_<RSID>.
# NOTE(review): the fixed row indices (1:3 here and below, 4:7) presumably correspond to one
# three-SNP locus followed by two two-SNP loci - confirm against the row order of multi_snps.
for(i in 4:7){
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition ', multi_snps$RSID[i], ' --out ', here("Output/Temp/"), 'final_gwas_', multi_snps$RSID[i]))
}
# Rows 1-3 of multi_snps: for each SNP, write the other two of these three SNPs to a condition
# list and rerun the logistic model adjusting for both (--condition-list).
for(i in 1:3){
write_delim(multi_snps %>% slice(1:3) %>% select(RSID) %>% filter(RSID != multi_snps$RSID[i]), file = paste0(here("Output/Temp/"), "conditionlist_", multi_snps$RSID[i], ".txt"), delim = "\n", col_names = F)
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition-list ', here("Output/Temp/"), 'conditionlist_', multi_snps$RSID[i], '.txt --out ', here("Output/Temp/"), 'final_gwas_', multi_snps$RSID[i]))
}
# Read every final_gwas_*logistic result file back in as a global object named after the file,
# keeping only the rows where TEST == "ADD"
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "final_gwas_.+logistic")]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
# SNPs 1-3: from the model run for SNP i, keep row i.
# NOTE(review): assumes row i of the PLINK output is SNP i itself, i.e. that the output order
# matches the order of multi_snps - confirm.
tmp <- c()
for(i in 1:3){
tmp2 <- get(paste0("final_gwas_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(i)
tmp <- rbind(tmp, tmp2)
}
# SNPs 4-7: drop the first three rows of each result and stack what remains
tmp2 <- c()
for(i in 4:7){
tmp3 <- get(paste0("final_gwas_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(-(1:3))
tmp2 <- rbind(tmp2, tmp3)
}
# Keep four rows of the stacked results, selected by position.
# NOTE(review): positions 5, 2, 15 and 12 are hard-coded; presumably each one is a SNP's estimate
# from the model conditioned on its partner SNP - verify against the row order of tmp2.
tmp2 <- tmp2 %>% slice(5, 2, 15, 12)
multi_snps2 <- rbind(tmp, tmp2)
# Attach the conditional results (columns CHR through BP, OR through U95, and P) to the original
# rows by chromosome and position; the downstream code refers to the duplicated statistic columns
# with .x (from multi_snps) and .y (conditional) suffixes.
tmp2 <- multi_snps2 %>%
select(CHR:BP, OR:U95, P)
multi_snps3 <- left_join(multi_snps, tmp2, by = c("CHR", "BP"))
# r2 between the SNPs of the merged fileset (no r2 threshold), read back into R
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --r2 inter-chr --ld-window-r2 0 --out ', here("Output/Temp/"), 'merged_ld'))
merged_ld <- read_table(paste0(here("Output/Temp/"), "merged_ld.ld"))
# test multicollinearity in SLC2A9 model in next Rmd

# Loci that contribute a single SNP (their BP1 occurs only once) keep their
# original, unconditioned statistics.
single_snps <- gout_top_full %>%
  filter(!(BP1 %in% names(which(table(gout_top_full$BP1) > 1))))
single_snps2 <- select(
  single_snps,
  CHR, RSID, BP:Alternate_Allele, OR:U95, P, EAF, INFO, BP1, BP2
)

# SNPs from multi-SNP loci: the conditional estimates (.y) become the primary
# statistics, the original estimates (.x) are kept with an "_old" suffix.
multi_snps4 <- multi_snps3 %>%
  select(
    CHR, RSID, BP:Alternate_Allele,
    OR.y:U95.y, P.y,
    OR.x:U95.x, P.x,
    EAF, INFO, BP1, BP2
  ) %>%
  rename(OR = OR.y, SE = SE.y, L95 = L95.y, U95 = U95.y, P = P.y) %>%
  rename(OR_old = OR.x, SE_old = SE.x, L95_old = L95.x, U95_old = U95.x, P_old = P.x)

# Stack both sets (join on all shared columns) and order by genomic position
gout_top_final <- arrange(full_join(multi_snps4, single_snps2), CHR, BP)
# Orient every SNP so that the effect allele is the gout risk allele (OR > 1),
# then label each SNP with its locus name (based on the locus zooms).
# SNPs with OR < 1 are inverted: OR and CI bounds become reciprocals (the two
# bounds trade places), EAF becomes 1 - EAF and the allele columns are swapped.
smallOR <- gout_top_final %>%
  filter(OR < 1) %>%
  mutate(
    OR = signif(1 / OR, digits = 4),
    # mutate() evaluates sequentially, so the inverted bounds are parked in
    # temporaries before being written back under the opposite name.
    tmp_L = signif(1 / L95, digits = 4),
    tmp_U = signif(1 / U95, digits = 4),
    U95 = tmp_L,
    L95 = tmp_U,
    OR_old = round(1 / OR_old, digits = 3),
    tmp_L_old = round(1 / L95_old, digits = 3),
    tmp_U_old = round(1 / U95_old, digits = 3),
    U95_old = tmp_L_old,
    L95_old = tmp_U_old,
    EAF = 1 - EAF
  ) %>%
  # Swap the two allele columns via intermediate names
  rename(allele2 = Effect_Allele, allele1 = Alternate_Allele) %>%
  rename(Alternate_Allele = allele2, Effect_Allele = allele1) %>%
  # Restores the column order and drops the tmp_* helper columns
  select(CHR:BP, Effect_Allele, Alternate_Allele, OR:BP2)

# SNPs already oriented to the risk allele are kept as they are
bigOR <- filter(gout_top_final, OR > 1)

gout_top_final <- full_join(smallOR, bigOR) %>%
  arrange(CHR, BP) %>%
  mutate(
    Locus_Name = c(
      "PDZK1", "TRIM46", "GCKR", "SFMBT1",
      "SLC2A9", "SLC2A9", "SLC2A9",
      "ABCG2", "ABCG2",
      "SLC17A1", "ZSCAN31", "MLXIPL", "SLC16A9",
      "SLC22A11", "SLC22A11",
      "OVOL1", "R3HDM2", "MLXIP", "PNPLA3"
    )
  )

UKBB_Gene_OR <- gout_top_final
save(UKBB_Gene_OR, file = here("Output/UKBB_Gene_OR.RData"))

# Cleaning up: remove intermediate objects and the locus-zoom helper functions
rm(
  list = ls()[str_detect(ls(), "^chr|^gout_top|^final_|gwas$")],
  bigOR, first_round, multi_snps, multi_snps2, multi_snps3, multi_snps4,
  merged_ld, out, regions, second_round, single_snps, smallOR, third_round,
  tmp, tmp2, tmp3, file_names, i, sumstat_signif, single_snps2, loci,
  sumstat_final, UCSC_GRCh37_Genes_UniqueList.txt, check.rsid, elog10,
  gene.position, get.ld, get.region, locus.zoom, merge.gene.colour,
  merge.plot.dat, plot.locus, plot.secondary.point, read.plink.loci,
  round.up, subset.data
)
Locus-Zooms
All of the locus zooms are plotted below in separate tabs:
# Build one knitr child chunk (tab) per locus-zoom image in Output/Plots.
# The conditioned plots are named
#   Chr<CHR>_<BP1>_<BP2>_<RSID>_condition_<SNP1>and<SNP2>....jpg
# and the unconditioned ones end in "_unconditioned.jpg".
file_names <- list.files(here("Output/Plots"), pattern = "^Chr", full.names = TRUE)

# Parse the file name itself rather than the full path, so the result does not
# depend on where the project lives (number of "_" or "/" in the absolute path).
tmp <- tibble(value = basename(file_names)) %>%
  mutate(rowname = row_number()) %>%
  separate(
    value,
    sep = "_",
    into = c("CHR", "BP1", NA, NA, "Cond", "CondSNPs"),
    fill = "right" # unconditioned plots have no CondSNPs piece
  ) %>%
  mutate(
    Cond1 = !(Cond %in% "unconditioned.jpg"),
    BP1 = as.numeric(BP1),
    CHR = as.numeric(str_replace(CHR, "Chr", ""))
  ) %>%
  arrange(CHR, BP1, Cond1, CondSNPs)

# Map each locus start (BP1) to its locus name. The pairing is positional:
# unique BP1 values in genomic order against unique Locus_Name values in the
# order stored in UKBB_Gene_OR, so the two must have the same length.
tmp1 <- tibble(BP1 = unique(tmp$BP1))
load(here("Output/UKBB_Gene_OR.RData"))
tmp2 <- tibble(Locus_Name = unique(UKBB_Gene_OR$Locus_Name))
if (nrow(tmp1) != nrow(tmp2)) {
  stop(
    "Number of plotted loci (", nrow(tmp1), ") does not match the number of locus names (",
    nrow(tmp2), ")",
    call. = FALSE
  )
}
tmp2 <- cbind(tmp2, tmp1)
tmp3 <- tmp %>%
  left_join(tmp2, by = "BP1")

# Human-readable tab titles, e.g. "SLC2A9 (Cond. on rs1 and rs2)"
tmp4 <- tmp3 %>%
  mutate(CondSNPs2 = str_remove(CondSNPs, "\\.jpg$")) %>%
  separate(CondSNPs2, sep = "and", into = c("SNP1", "SNP2", "SNP3", "SNP4"), fill = "right") %>%
  mutate(
    SNPs = case_when(
      is.na(SNP2) ~ SNP1,
      !is.na(SNP2) & is.na(SNP3) ~ str_c(SNP1, SNP2, sep = " and "),
      !is.na(SNP3) & is.na(SNP4) ~ str_c(SNP1, SNP2, SNP3, sep = " and "),
      !is.na(SNP4) ~ str_c(SNP1, SNP2, SNP3, SNP4, sep = " and ")
    ),
    Plot_Name = case_when(
      !Cond1 ~ paste0(Locus_Name, " (Uncond.)"),
      Cond1 ~ paste0(Locus_Name, " (Cond. on ", SNPs, ")")
    )
  )

# Image paths in display order, addressable by tab title
file_names2 <- file_names[tmp$rowname]
names(file_names2) <- tmp4$Plot_Name

# Chunk template; {{nm}} is filled in by knit_expand() from the `nm` argument
# of the anonymous function below.
template <- c(
  "#### {{nm}}\n",
  "```{r, echo = FALSE}\n",
  "include_graphics(file_names2['{{nm}}'])\n",
  "```\n",
  "\n"
)
plots <- lapply(
  tmp4$Plot_Name,
  function(nm) knit_expand(text = template)
)
PDZK1 (Uncond.)

PDZK1 (Cond. on rs10910845)

TRIM46 (Uncond.)

TRIM46 (Cond. on rs11264341)

GCKR (Uncond.)

GCKR (Cond. on rs1260326)

SFMBT1 (Uncond.)

SFMBT1 (Cond. on rs9847710)

SLC2A9 (Uncond.)

SLC2A9 (Cond. on rs7675964)

SLC2A9 (Cond. on rs7675964 and rs6811287)

SLC2A9 (Cond. on rs7675964 and rs6811287 and rs4481233)

ABCG2 (Uncond.)

ABCG2 (Cond. on rs2231142)

ABCG2 (Cond. on rs2231142 and rs10011796)

SLC17A1 (Uncond.)

SLC17A1 (Cond. on rs1165196)

ZSCAN31 (Uncond.)

ZSCAN31 (Cond. on rs853685)

MLXIPL (Uncond.)

MLXIPL (Cond. on rs3812316)

SLC16A9 (Uncond.)

SLC16A9 (Cond. on rs1171616)

SLC22A11 (Uncond.)

SLC22A11 (Cond. on rs17300741)

SLC22A11 (Cond. on rs17300741 and rs7937990)

OVOL1 (Uncond.)

OVOL1 (Cond. on rs4014195)

R3HDM2 (Uncond.)

R3HDM2 (Cond. on rs1106766)

MLXIP (Uncond.)

MLXIP (Cond. on rs28652632)

PNPLA3 (Uncond.)

PNPLA3 (Cond. on rs738409)

Summary
In summary, I produced a list of SNPs that will be used to create a PRS. This uses SNPs genotyped on the Human CoreExome v1.0 chip. This will ensure that no imputation was done on Polynesian individuals, which should make the genotypes more reliable. Three of the 15 total loci had more than one partially independent genome-wide significant signal. These were at SLC2A9 (3 hits), ABCG2 (2 hits), and SLC22A11 (2 hits).
Tin Urate PRS
In response to reviewer comments, I also produced a PRS based on the results of Tin et al., 2019. The code below does the following:
CoreExome genotyped plink files were filtered to exclude variants with over 10% missingness and those with MAF less than 0.01 in the entire CoreExome cohort
The locations of these filtered SNPs were extracted from the bim file and this was filtered to only include SNPs that were also in the UK Biobank imputed genotype list
Finally, the Tin et al. European summary statistics were filtered to only keep SNPs matching the chromosome and location of the above SNPs
mkdir -p /Volumes/scratch/merrimanlab/Nick/PRS
cd /Volumes/scratch/merrimanlab/Nick/PRS
# QC the genotyped CoreExome data: drop variants with more than 10% missingness or MAF below 0.01
plink1.9b6.10 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --make-bed --geno 0.1 --maf 0.01 --out filtered_coreex
# Chromosome and bp position (bim columns 1 and 4) of every retained SNP, space separated
cut -f1,4 filtered_coreex.bim | tr '\t' ' ' > coreex_snps.txt
# One position list per autosome. Compare the chromosome column explicitly so a
# number appearing in the position column can never select a line.
for i in {1..22}
do
awk -v chr="$i" '$1 == chr { print $2 }' coreex_snps.txt > "snplist_chr$i.txt"
done
# Now filter for SNPs in the UK Biobank imputed genotype list (column 3 of the
# mfi files is matched against the positions) and prefix each hit with its chromosome
parallel "cut -f3 /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/splits/ukb_mfi_chr{}_v3.txt | grep -Fwf snplist_chr{}.txt | sed 's/^/{} /' > in_ukb_chr{}.txt" ::: {1..22}
cat in_ukb_chr*.txt > snplist.txt
# Filter tin sumstats to match both chromosome and bp of CoreEx SNPs plus remove rare SNPs (less than 1% frequency).
# -w makes each "chr pos" pattern match whole fields only; without it "1 12345"
# would also match e.g. "11 123456".
tin=/Volumes/archive/merrimanlab/central_datasets/summary_gwas/urate/Tin_2019/cleaned/urate_chr1_22_LQ_IQ06_mac10_EA_60_rsid.txt
{
head -n1 "$tin"
awk '$6 < 0.99 && $6 > 0.01' "$tin" | grep -wFf snplist.txt
} | tr ' ' '\t' > tin_filtered.txt
The filtered Tin summary statistics were read back into R, then further cleaned up to ensure no rare variants remained
The summary statistics were used to define a list of top SNPs in a similar manner to the previous code for the UK Biobank gout GWAS
No conditional analyses were run on this list of summary statistics
In total, 82 common variants were defined as being associated with serum urate in Europeans, all of which were suitable for downstream analysis
# Read the pre-filtered Tin et al. summary statistics (tab separated, with header)
tin <- vroom(
  "/Volumes/scratch/merrimanlab/Nick/PRS/tin_filtered.txt",
  delim = "\t",
  col_names = TRUE
)

# Variants whose ID starts with "rs" followed by digits keep that ID ...
test <- filter(tin, str_detect(RSID, "^rs[0-9]+"))
# ... all remaining variants get an ID pasted together from chromosome,
# position and both alleles.
test2 <- tin %>%
  filter(!(RSID %in% test$RSID)) %>%
  mutate(RSID = paste0(Chr, Pos_b37, Allele1, Allele2))

# Recombine, sort by position and switch to the column names used for the
# UK Biobank results; t is the absolute test statistic |Beta / SE|.
tin2 <- rbind(test, test2) %>%
  arrange(Chr, Pos_b37) %>%
  rename(
    CHR = Chr,
    BP = Pos_b37,
    P = `P-value`,
    Beta = Effect,
    SE = StdErr,
    EAF = Freq1,
    Effect_Allele = Allele1,
    Alternate_Allele = Allele2
  ) %>%
  mutate(t = abs(Beta/SE))
# nrow(tin2 %>% select(CHR, BP) %>% unique()) # all unique

# Genome-wide significant common variants, strongest signal first
sumstat_signif <- tin2 %>%
  filter(P <= 5e-8 & EAF > 0.01 & EAF < 0.99) %>%
  arrange(desc(t))
# Grouping into loci +- 500 kb of top SNPs
# Greedy clumping: sumstat_signif is ordered by descending t, so repeatedly take
# the strongest remaining SNP as a lead and discard every SNP on the same
# chromosome within 500 kb of it (the lead itself included).
gout_top <- sumstat_signif %>%
  slice(0)
gout2 <- sumstat_signif
while (nrow(gout2) > 0) {
  tmp <- gout2 %>%
    slice(1)
  gout_top <- rbind(tmp, gout_top)
  # Arithmetic window test: same SNPs as matching against the full integer
  # range (lead - 500000):(lead + 500000), without building a
  # 1,000,001-element vector on every iteration.
  gout2 <- gout2 %>%
    filter(!(CHR == tmp$CHR & abs(BP - tmp$BP) <= 500000))
}
gout_top <- gout_top %>%
  arrange(CHR, BP)
# Finding regions of loci
# Walk the significant SNPs in genomic order. A new region starts at the first
# SNP, at a chromosome change, or where the gap to the previous significant SNP
# is 500 kb or more. Computed with lag()/lead() so it also works when only one
# SNP is significant (a 2:nrow() loop would index past the end).
sumstat_signif <- sumstat_signif %>%
  arrange(CHR, BP)
tmp <- sumstat_signif %>%
  mutate(new_region = row_number() == 1 | CHR != lag(CHR) | (BP - lag(BP)) >= 500000)
# Extracting regions
# tmp1: first SNP of every region; tmp2: last SNP of every region (the row
# before the next region starts, or the final row). Both have one row per
# region, in the same order.
tmp1 <- tmp %>%
  filter(new_region) %>%
  select(CHR, BP1 = BP)
tmp2 <- tmp %>%
  filter(lead(new_region, default = TRUE)) %>%
  select(BP2 = BP)
# Pad every region by 50 kb on both sides
bgen_ranges <- tmp1 %>%
  cbind(tmp2) %>%
  mutate(
    BP1 = BP1 - 50000,
    BP2 = BP2 + 50000
  )
# Keep one lead SNP per region: the lead with the smallest P, ties (e.g.
# P-values that are numerically identical) broken by the larger |Beta / SE|.
out <- vector("list", nrow(bgen_ranges))
for (i in seq_len(nrow(bgen_ranges))) {
  out[[i]] <- gout_top %>%
    filter(CHR == bgen_ranges$CHR[i] & between(BP, bgen_ranges$BP1[i], bgen_ranges$BP2[i])) %>%
    arrange(P, desc(t)) %>%
    slice(1)
}
out <- bind_rows(out)
# The column-bind below pairs rows by position, so every region must have
# produced exactly one lead SNP.
if (nrow(out) != nrow(bgen_ranges)) {
  stop("Not every region contains a lead SNP", call. = FALSE)
}
gout_top <- out %>%
  cbind(bgen_ranges %>% select(-CHR))
rm(tmp, bgen_ranges, out, i)
# Flipping allele order + Beta etc so the effect allele is always the allele
# with a positive Beta.
# 95% confidence limits on the Beta scale
gout_top <- gout_top %>%
  mutate(
    L95 = Beta - 1.96 * SE,
    U95 = Beta + 1.96 * SE
  )

# Negative effects: negate Beta and the limits (which trade places), use
# 1 - EAF and swap the two allele columns.
smallOR <- gout_top %>%
  filter(Beta < 0) %>%
  mutate(
    Beta = signif(-Beta, digits = 4),
    # mutate() evaluates sequentially, so park the negated limits in
    # temporaries before writing them back under the opposite name.
    tmp_L = signif(-L95, digits = 4),
    tmp_U = signif(-U95, digits = 4),
    U95 = tmp_L,
    L95 = tmp_U,
    EAF = 1 - EAF
  ) %>%
  rename(allele2 = Effect_Allele, allele1 = Alternate_Allele) %>%
  rename(Alternate_Allele = allele2, Effect_Allele = allele1) %>%
  select(CHR:BP, RSID, Effect_Allele, Alternate_Allele, Beta, L95, U95, SE:BP2)

# Positive effects: only rounded to 4 significant digits
bigOR <- gout_top %>%
  filter(Beta > 0) %>%
  mutate(
    Beta = signif(Beta, digits = 4),
    L95 = signif(L95, digits = 4),
    U95 = signif(U95, digits = 4)
  ) %>%
  select(CHR:BP, RSID, Effect_Allele, Alternate_Allele, Beta, L95, U95, SE:BP2)

# Recombine, drop helper columns and upper-case the allele codes
gout_top_final <- full_join(smallOR, bigOR) %>%
  arrange(CHR, BP) %>%
  select(-t, -n_total_sum) %>%
  mutate(
    Effect_Allele = toupper(Effect_Allele),
    Alternate_Allele = toupper(Alternate_Allele)
  )
# Locus names are not assigned for this list; disabled code kept for reference:
#mutate(Locus_Name = c("PDZK1", "TRIM46", "GCKR", "SFMBT1", "SLC2A9", "SLC2A9", "SLC2A9", "ABCG2", "ABCG2", "SLC17A1", "ZSCAN31", "MLXIPL", "SLC16A9", "SLC22A11", "SLC22A11", "OVOL1", "R3HDM2", "MLXIP", "PNPLA3"))

Tin_Gene_OR <- gout_top_final
save(Tin_Gene_OR, file = here("Output/Tin_Gene_OR.RData"))

# Cleaning up
rm(
  list = ls()[str_detect(ls(), "^chr|^gout_top|^final_|gwas$")],
  bigOR, smallOR, tmp1, tmp2, sumstat_signif, gout2, test, test2, tin, tin2
)
Preparing Phenotype files and making PRS
The purpose of this section is to generate cleaned up phenotype files for each cohort, with the polygenic risk score (PRS) included for each. It contains the code for going from the raw phenotype and genotype data (in combination with the SNP lists generated in the previous section) to the finalized data frames for analysis.
The phenotypes of interest are the following (note some may be poorly phenotyped):
Self-reported gout status (i.e. gout vs control)
Self-reported age at collection
Genetically determined sex
Genetic principal components (all 10 global PCs and all 10 Oceanian PCs for Polynesians)
Self-reported age at gout onset
- Disease duration derived from this and age at collection
Self-reported presence of tophi
Self-reported flare frequency (number of flares in the last year)
Serum urate at collection
Self-reported urate lowering therapy data (at collection)
Self-reported gout prophylaxis data (at collection)
Genetic ancestry data (i.e. European vs West Polynesian vs East Polynesian)
Comorbidity data (for the descriptive statistics table): BMI, hypertension, diabetes, heart disease (angina, myocardial infarction, or heart failure), kidney disease (serum creatinine/eGFR), dyslipidemia, and stroke. Each is derived from self-report, medication use, and/or measured values such as BMI
Lifestyle factors - total alcohol consumption, sugar-sweetened drink consumption, smoking status (for descriptive stats table)
Self-reported family history of gout
Exclusion criteria:
- Mismatch between genetic sex and self-reported gender
# Loading PRS SNPs
# Restores UKBB_Gene_OR and Tin_Gene_OR, the two SNP tables used below
load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Tin_Gene_OR.RData"))
# Extracting SNPs from the CoreExome Plink file ----------------------------------
# Union of both SNP lists written as CHR, BP, BP2 (= BP), RSID rows,
# the file passed to plink's `--extract range` below
for_plink <- UKBB_Gene_OR %>%
select(CHR, BP, RSID) %>%
rbind(select(Tin_Gene_OR, CHR:RSID)) %>%
unique() %>%
mutate(BP2 = BP) %>%
select(CHR, BP, BP2, RSID) %>%
arrange(CHR, BP)
write_delim(for_plink, file = here("Output/Temp/UKBB_SNPs_Plink.txt"), col_names = F)
rm(for_plink)
# First call subsets the CoreExome bfile to the listed positions (Output/Temp/SNPs.*);
# second call re-exports that subset with --recode
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --extract range ', here("Output/Temp/UKBB_SNPs_Plink.txt"), ' --make-bed --out ', here("Output/Temp/SNPs")))
# NOTE(review): the path below is built as here("Output/Temp/") followed by 'SNPs' with no separator — confirm here() keeps the trailing slash
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'SNPs --recode --out ', here("Output/Temp/"), 'SNPs'))
# Extracting SNPs from the UK Biobank and converting to merged plink file -------------------------------
# Unique CHR/BP pairs across both SNP tables; `tmp` is reused below for the chromosome list
tmp <- UKBB_Gene_OR %>%
select(CHR, BP) %>%
rbind(select(Tin_Gene_OR, CHR, BP)) %>%
unique() %>%
arrange(CHR, BP)
# Ranges are written as "CC:BP-BP"; chromosomes below 10 get a leading zero
# NOTE(review): paste0() on a double BP can print scientific notation (e.g. 1e+08) — confirm BP is integer or never hits such values
bgen_range1 <- tmp %>%
filter(CHR < 10) %>%
mutate(BGEN = paste0("0", CHR, ":", BP, "-", BP))
bgen_range2 <- tmp %>%
filter(CHR > 9) %>%
mutate(BGEN = paste0(CHR, ":", BP, "-", BP))
bgen_range <- rbind(bgen_range1, bgen_range2) %>%
arrange(CHR, BP) %>%
select(BGEN)
write_delim(bgen_range, file = here("Output/Temp/PRS_SNPs_BGEN.txt"), delim = "\n", col_names = F)
rm(bgen_range1, bgen_range2, bgen_range)
# Per chromosome (GNU parallel): bgenix extracts the ranges as VCF, bcftools replaces the header and renames contigs, bgzip compresses
system(paste0('source ~/.bashrc; parallel "bgenix -g /Volumes/archive/merrimanlab_nobackup/ukbio/EGAD00010001474/ukb_imp_chr{}_v3.bgen -vcf -incl-range ', here("Output/Temp", "PRS_SNPs_BGEN.txt"), ' | bcftools reheader -h /Volumes/archive/merrimanlab_nobackup/ukbio/EGAD00010001474/bgen_to_vcf/new_header.txt | bcftools annotate --rename-chrs /Volumes/archive/merrimanlab_nobackup/ukbio/EGAD00010001474/bgen_to_vcf/rename_contigs.txt | bgzip -c > ', here("Output/Temp", "chr"), '{}_forPRS.vcf.gz" ::: ', paste(unique(tmp$CHR), collapse = " ")))
# Convert each per-chromosome VCF to a plink bfile
# NOTE(review): this call uses plink1.9b4.9 while the other calls use plink1.9b6.10 — confirm this is intentional
system(paste0('source ~/.bashrc; parallel "plink1.9b4.9 --vcf ', here("Output/Temp/"), 'chr{}_forPRS.vcf.gz --make-bed --out ', here("Output/Temp/"), 'chr{}_PRS" ::: ', paste(unique(tmp$CHR), collapse = " ")))
# One bfile prefix per line, consumed by --merge-list
write_delim(as_tibble(paste0(here("Output/Temp/"), "chr", unique(tmp$CHR), "_PRS")), file = here("Output/Temp/mergefile_prs.txt"), delim = "\n", col_names = F)
# Merge the per-chromosome bfiles into merged_PRS, then re-export with --recode
system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile_prs.txt --make-bed --out ', here("Output/Temp/"), 'merged_PRS'))
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_PRS --recode --out ', here("Output/Temp/"), 'merged_PRS'))
# Making phenotype files -----------------------------------------------------------------------------------
if(file.exists(here("Output/Phenotypes.RData"))){
load(here("Output/Phenotypes.RData"))
} else {
# CoreExome QC 1-10 Phenotype file (made by Tanya)
# Every character column is converted to a factor on load
CoreExPheno <- read_delim(here("Data/Phenotypes/CZ-MB1.2-QC1.10_MergedPhenotypes_20082020.txt"), delim = "\t") %>%
  mutate(across(where(is_character), factor))
# European cohorts = All Ardea (split into each study), EuroGout (includes EireGout), Gout in Aotearoa (combine with AGRIA + DM + RD + NP + LPA => ANZ cohort)
# Sample IDs present in the genotype .fam file (second column, X2); used to keep genotyped samples only.
# col_names is spelled FALSE rather than F, because F is an ordinary variable that can be reassigned.
All_CoreEx_ID <- read_delim(here("Data/Genotypes/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.fam"), delim = " ", col_names = FALSE)
CoreExPheno_Euro <- CoreExPheno %>%
  filter(Geno.BroadAncestry == "European",
         Geno.SampleID %in% All_CoreEx_ID$X2,
         General.Use != "No",
         !(Pheno.Study %in% c("Auckland Controls", "Australian Controls", "ESR", "Rheumatoid Arthritis")))
# 1,304 NU, 483 HU controls (1,787 total) + 5,161 gout = either GP or ACR or self-report
# Polynesian cohorts = AGRIA, All Ardea, DM, EuroGout, Aotearoa (both new + old), LPA, Ngati Porou, RD => all split into East and West
CoreExPheno_Poly <- CoreExPheno %>%
  filter(Geno.BroadAncestry == "Oceanian",
         Geno.SampleID %in% All_CoreEx_ID$X2,
         General.Use != "No",
         !(Pheno.Study %in% c("ESR", "Pacific Trust")))
# 1,021 NU, 248 HU controls (1,269 total) + 1,380 gout
# Stack both ancestry subsets, drop unknown genetic sex and missing gout status,
# and collapse gout status to Gout vs Control ("HyperU" counts as Control; any
# other value becomes NA). Re-applying factor() drops factor levels that are no longer used.
CoreExPheno_Final <- full_join(CoreExPheno_Euro, CoreExPheno_Poly) %>%
  filter(Geno.GeneticSex != "Unknown",
         !is.na(Pheno.GoutSummary)) %>%
  mutate(Pheno.GoutSummary = factor(case_when(Pheno.GoutSummary == "Gout" ~ "Gout",
                                              Pheno.GoutSummary %in% c("Control", "HyperU") ~ "Control")),
         across(where(is.factor), factor)) %>%
  select(Pheno.SampleID:Pheno.UrateTherapy, GenStudio.ChipType, GenStudio.CallRate:Notes)
rm(CoreExPheno, CoreExPheno_Euro, CoreExPheno_Poly, All_CoreEx_ID)
# Throughout, duration is derived as age at collection - age at onset + 1 (because it is derived from ages in whole years, there is up to 1 additional year of duration that needs to be accounted for)
# Going through each cohort alphabetically: AGRIA, Ardea - Ironwood, Ardea - LASSO, DM, EuroGout, Gout in Aotearoa, LPA, Ngati Porou, RD
# Recode a 1/2-coded variable to logical: 1 -> FALSE, 2 -> TRUE.
# Any other value (including NA) becomes NA. Values are compared by their
# character form, so numeric, character and factor inputs behave alike.
logicfactor <- function(x) {
  code_position <- match(as.character(x), c("1", "2"))
  c(FALSE, TRUE)[code_position]
}
# AGRIA
# Builds agria_pheno: the AGRIA phenotype file restricted to samples in
# CoreExPheno_Final, joined to the CoreExome columns, recoded, and reduced to
# the common set of analysis columns selected at the end.
tmp <- CoreExPheno_Final %>%
filter(Pheno.Study == "AGRIA")
agria_pheno <- read_delim(here("Data/Phenotypes/AGRIAPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>%
# Recoding: 1/2-coded columns go through logicfactor (1 = FALSE, 2 = TRUE); the rest get labelled factors
mutate(across(where(is_character), factor),
across(c(DIABETES:KIDNEY, ALLOP, PROBEN:STEROID, ANTIINFLAM, COLCHI, GPGOUT:SUSTOPHUS, FAMGOUT, FAMGOUT3, CAUGOUTAFFSTAT:TOPHIGOUT, URATELOWERING, FOOD),
logicfactor),
SEX = factor(SEX, labels = c("Male", "Female")),
ETHCLASS = factor(ETHCLASS,
levels = c(1:4, 9),
labels = c("East Polynesian", "West Polynesian", "Caucasian", "Other", "Mixed East/West Polynesian")),
ASPIRATE = factor(ASPIRATE,
levels = 1:3,
labels = c("No", "Yes", "Unknown")),
DIURETICINDUCED = factor(DIURETICINDUCED,
levels = 1:3,
labels = c("No", "Yes", "Unknown")),
TOPHUS = factor(TOPHUS,
levels = 1:3,
labels = c("No", "Yes", "Undetermined")),
# TOPHUS is then collapsed to logical; "Undetermined" and missing become NA
TOPHUS = case_when(TOPHUS == "Yes" ~ TRUE, TOPHUS == "No" ~ FALSE, TRUE ~ NA),
ALLOPSIDE = factor(ALLOPSIDE,
levels = 1:3,
labels = c("No", "Yes", "Not taking allopurinol")),
OFFWORK = factor(OFFWORK,
levels = 1:3,
labels = c("No", "Yes", "Not applicable as not currently working")),
ALCOTRIG = factor(ALCOTRIG,
levels = 1:4,
labels = c("No", "Yes", "Unsure", "Non drinker")),
SECONDARYGOUT = factor(SECONDARYGOUT,
levels = 1:4,
labels = c("Primary Gout", "Secondary Gout", "Unknown", "Control")),
SSBCODE = factor(SSBCODE,
levels = 0:5,
labels = c("0/day", "0.1 - 0.99", "1.0 - 1.99", "2.0 - 2.99", "3.0 - 3.99", "4.0 +")),
FRUITCODE = factor(FRUITCODE,
levels = 0:5,
labels = c("0/day", "0.1 - 0.99", "1.0 - 1.99", "2.0 - 2.99", "3.0 - 3.99", "4.0 +")),
DIURETICSUMMARY = factor(DIURETICSUMMARY,
levels = 1:3,
labels = c("Not taking diuretics", "Taking diuretics", "Maybe taking diuretics")),
GOUTSUM = factor(GOUTSUM,
levels = 1:3,
labels = c("Control", "Gout", "May have gout/weak gout"))) %>%
rename(IID = PATIENT,
GOUT = Pheno.GoutSummary) %>%
# Derived analysis variables. SEX is overwritten with the genetic sex from the CoreExome file.
mutate(SEX = Geno.GeneticSex,
# Ages in whole years between date of birth and each sample date
AGESERUM = round(as.duration(interval(DOB, SERUMDATE)) / as.duration(years(1)),
digits = 0),
AGESCL = round(as.duration(interval(DOB, SCLDATE)) / as.duration(years(1)),
digits = 0),
# Use the recorded AGEGOUTDOX when present, otherwise derive it from GOUTDOXDATE
AGE1ATK = case_when(is.na(AGEGOUTDOX) ~ round(as.duration(interval(DOB, GOUTDOXDATE)) / as.duration(years(1)),
digits = 0),
TRUE ~ AGEGOUTDOX),
DURATION = AGECOL - AGE1ATK + 1,
# Specific free-text comments force FALSE; otherwise any tophus flag or a tophus-related comment gives TRUE
TOPHIGOUT = case_when(COMMENT %in% c("No information, neither tophaceous or aspirate proven, Deceased",
"No information, neither tophaceous or aspirate proven",
"Gout, no tophi",
"No information, neither tophaceous or aspirate proven, lymphoma") ~ FALSE,
TRUE ~ TOPHUS | GOUTCRITERIAB | SUSTOPHUS | COMMENT %in% c("Tophaceous",
"Urate crystals present, tophaceous",
"Aspirate proven, tophacous",
"allopurinol intolerant, febuxostat intolerant, taking benzobromarone. Urate crystals present, tophacious",
"Tophaceous gout",
"Polyarticular tophaceous gout",
"Chronic tophaceous gout")),
EROSIONS = NA,
NUMATK = NA,
# presumably * 1000 / 59.48 converts urate from mmol/L to mg/dL — TODO confirm source units
URATE1 = round(URATE * 1000 / 59.48, digits = 1),
URATEAGE1 = AGESERUM,
URATE2 = round(SURICACID_SCL * 1000 / 59.48, digits = 1),
URATEAGE2 = AGESCL,
# URATE is overwritten here: prefer URATE1, fall back to URATE2
URATE = case_when(!is.na(URATE1) ~ URATE1,
TRUE ~ URATE2),
# ULT is set to NA when only the second urate measurement (URATE2) is available
ULT = case_when(is.na(URATE1) & !is.na(URATE2) ~ NA,
TRUE ~ ALLOP | PROBEN | COMMENT %in% c("allopurinol intolerant, febuxostat intolerant, taking benzobromarone. Urate crystals present, tophacious",
"Allopurinol hypersensitivity, Cholchicine induced diarrhoea, Febuxostat 40mg/day",
"febuxostat 40mg/day; liver toxicity with allopurinol")),
PROPHY = STEROID | ANTIINFLAM | COLCHI,
# Diuretic information is used only when HIBP is missing
HYPERTENSION = case_when(!is.na(HIBP) ~ HIBP,
TRUE ~ DIURETICINDUCED == "Yes" | DIURETICSUMMARY == "Maybe taking diuretics"),
# presumably mmol/L -> mg/dL factors for triglycerides / cholesterol — TODO confirm
TRIGLY = TRIGLY_SCL * 88.57,
CHOLES = CHOLES_SCL * 38.67,
STROKE = NA,
HDL = HDL_SCL * 38.67,
# presumably umol/L -> mg/dL for creatinine — TODO confirm
CREAT = CREAT / 88.42,
SCREAT = SCREAT / 88.42,
# Mean of whichever creatinine values are present (NaN when both are missing)
CREAT2 = rowMeans(across(c(CREAT, SCREAT)), na.rm = TRUE),
# Same form as the 4-variable MDRD eGFR equation (175 * Scr^-1.154 * age^-0.203, * 0.742 for females)
EGFR = case_when(SEX == "Male" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203),
SEX == "Female" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
KIDNEY = KIDNEY | !is.na(KIDNEY2) | EGFR < 60,
# rowSums(na.rm = TRUE) yields 0, not NA, when all three drinks are missing
TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
CURSMOKE = NA,
FAMGOUT = FAMGOUT | FAMGOUT3,
FAMGOUTNUM = as.numeric(FAMGOUT4)) %>%
# Common column set shared by all cohort tables so they can be row-bound later
select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# Ardea - Ironwood (CLEAR1, CLEAR2, CRYSTAL, LIGHT)
# CLEAR1 and CLEAR2 = People who are poor responders to allopurinol (everyone was on ULT at screening yet had > 6mg/dL)
# CRYSTAL = Two groups, 1 = same as CLEAR trials but also had >= 1 tophus, 2 = very HU people not on ULT with at least one tophus
# LIGHT = People who cannot take allopurinol, some may be on other ULT at screening
# Recode a 0/1-coded variable to logical: 0 -> FALSE, 1 -> TRUE.
# Any other value (including NA) becomes NA. Values are compared by their
# character form, so numeric, character and factor inputs behave alike.
logicfactor2 <- function(x) {
  code_position <- match(as.character(x), c("0", "1"))
  c(FALSE, TRUE)[code_position]
}
# Builds ironwood_pheno for the four Ardea/Ironwood trials (CLEAR1, CLEAR2,
# CRYSTAL, LIGHT): restrict to CoreExome samples, keep the columns of interest,
# join the CoreExome columns, recode, and reduce to the common analysis columns.
tmp <- CoreExPheno_Final %>%
filter(Pheno.Study %in% c("Ardea: CLEAR1", "Ardea: CLEAR2", "Ardea: CRYSTAL", "Ardea: LIGHT"))
ironwood_pheno <- read_delim(here("Data/Phenotypes/ArdeaPheno.txt"), delim = "\t") %>%
filter(SUBJID %in% tmp$Pheno.SampleID) %>%
select(SUBJID, AGE, BRTHDTC, BLWEIGHT, BLHEIGHT, BLBMI, TRT01AN, CONSDT, TRTSDT, ANGINA:HYPERTRIGLY, MI, STROKE, AGFIDDT:GFDUR, CRITBFL, PHNM8FL, ULTALLO:ULTOTH, PLACTOTSTDT:PLACTOTENDT, THIALKFL:PROPHTYPN, TOPHIFN:BLAREA, GFNUM:GFNUMGR, DATESCREENING:EGFRSCREENING, CHOLSCREENING, TRIGSCREENING, URATESCREENING, DATENEG7, URATENEG7, EGFRNEG7, DATEBASELINE, URATEBASELINE, EGFRBASELINE, DATEMONTH1, URATEMONTH1, DATEMONTH2, URATEMONTH2, DATEMONTH3, URATEMONTH3, DATEMONTH4, URATEMONTH4, DATEMONTH5, URATEMONTH5, DATEMONTH6, URATEMONTH6, DATEMONTH8, URATEMONTH8, DATEMONTH10, URATEMONTH10, DATEMONTH12, URATEMONTH12, DATEEARLYTERM, URATEEARLYTERM, DATEFOLLOWUP, URATEFOLLOWUP, CURSMOKE:ALCOHOL, TOPHIGOUT:GOUTNOTES) %>%
left_join(tmp, by = c("SUBJID" = "Pheno.SampleID")) %>%
# 0/1-coded flags go through logicfactor2 (0 = FALSE, 1 = TRUE)
mutate(across(where(is_character), factor),
across(c(ANGINA:STROKE, CRITBFL:ULTOTH, THIALKFL:PROPHYFL, TOPHIFN, CURSMOKE:ALCOHOL),
logicfactor2),
TRT01AN = factor(TRT01AN,
levels = 0:5,
labels = c("Screen Failure", "Group A (Placebo)", "Group B (Lesinurad 200 mg)", "Group C (Lesinurad 400 mg)", "Not Assigned", "Not Treated"))) %>%
rename(IID = SUBJID,
GOUT = Pheno.GoutSummary,
AGECOL = AGE) %>%
mutate(SEX = Geno.GeneticSex,
# Whole years between birth date (parsed with up to two missing components) and AGFIDDT
AGE1ATK = round(as.duration(interval(ymd(BRTHDTC, truncated = 2L), AGFIDDT)) / as.duration(years(1)),
digits = 0),
DURATION = AGECOL - AGE1ATK + 1,
TOPHIGOUT = TOPHIFN,
EROSIONS = NA,
NUMATK = GFNUM,
URATE = URATESCREENING,
# TRUE for any recorded ULT, for all CLEAR1/CLEAR2 participants, and for CRYSTAL participants with screening urate < 8
ULT = ULTALLO | ULTPROB | ULTFEBU | ULTOTH | Pheno.Study %in% c("Ardea: CLEAR1", "Ardea: CLEAR2") | (Pheno.Study == "Ardea: CRYSTAL" & URATE < 8),
PROPHY = PROPHYFL,
BMI = BLBMI,
HEART = HEARTFAILURE | MI | ANGINA,
KIDNEY = EGFRSCREENING < 60,
LIPIDS = HYPERCHOLESTEROL | HYPERTRIGLY,
TOTALALC = NA,
SUGDRINK = NA,
FAMGOUT = NA,
FAMGOUTNUM = NA,
# NOTE(review): EGFR2 is computed here but is not used afterwards and is not in the final select()
EGFR2 = case_when(SEX == "Male" ~ 175 * (SCRSCREENING ^ -1.154) * (AGECOL ^ -0.203),
SEX == "Female" ~ 175 * (SCRSCREENING ^ -1.154) * (AGECOL ^ -0.203) * 0.742)) %>%
# Common column set shared by all cohort tables
select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# Ardea - LASSO (all on ULT the whole time)
# Builds lasso_pheno from three LASSO files (flare, lab chemistry, main) joined on
# SUBJID, restricted to CoreExome samples with Pheno.Study == "Ardea: 401".
lassopheno1 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoFlare.txt"), delim = "\t")
lassopheno2 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoLabChem.txt"), delim = "\t")
lassopheno3 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoMain.txt"), delim = "\t")
# NOTE(review): full_join() repeats rows if SUBJID is not unique within a file — confirm one row per subject
tmp <- full_join(lassopheno3, lassopheno2, by = "SUBJID")
lasso_pheno <- full_join(tmp, lassopheno1, by = "SUBJID")
rm(lassopheno1, lassopheno2, lassopheno3)
tmp <- CoreExPheno_Final %>%
filter(Pheno.Study == "Ardea: 401")
lasso_pheno <- lasso_pheno %>%
# Samples are matched on DNAID (not SUBJID); IID is DNAID as character
filter(DNAID %in% tmp$Pheno.SampleID) %>%
mutate(IID = as.character(DNAID)) %>%
select(IID, AGE, BRTHDTC, GFNUM:ULTOSCR, BLBMI:PROPHTYP, AGFIDDT:GFDUR, TOLOCL:GFDTDURL, ANGINA:RENALIMPAIR, MI:STROKE, SCREENGFSTDT, SCREENGFENDT, SCREENGFOUT, SCREENGFSEV, SCREENPAIN, SCREENGFDUR:SCREENPAIN2, SCREENGFSTRSTP, SCREENLBDT, SCREENALT, SCREENCREAT, SCREENGGT, SCREENURATE, BASELINELBDT, BASELINEURATE, BASET1LBDT, BASET1URATE, BASET2LBDT, BASET2URATE, BASET3LBDT, BASET3URATE, MONTH1LBDT, MONTH1URATE, MONTH1T1LBDT, MONTH1T1URATE, MONTH1T2LBDT, MONTH1T2URATE, MONTH2LBDT, MONTH2URATE, MONTH2T1LBDT, MONTH2T1URATE, MONTH3LBDT, MONTH3URATE, MONTH3T1LBDT, MONTH3T1URATE, MONTH3T3LBDT, MONTH3T3URATE, MONTH4LBDT, MONTH4URATE, MONTH4T1LBDT, MONTH4T1URATE, MONTH5LBDT, MONTH5URATE, MONTH6LBDT, MONTH6URATE, UNSCHEDLBDT, UNSCHEDURATE, EARLYTERMLBDT, EARLYTERMURATE) %>%
left_join(tmp, by = c("IID" = "Pheno.SampleID")) %>%
rename(GOUT = Pheno.GoutSummary,
AGECOL = AGE,
NUMATK = GFNUM,
SEX = Geno.GeneticSex,
BMI = BLBMI) %>%
# 0/1-coded flags go through logicfactor2 (0 = FALSE, 1 = TRUE)
mutate(across(where(is_character), factor),
across(c(TOHANDFL:ULTOSCR, BLCDFL, ANGINA:RENALIMPAIR, MI:STROKE),
logicfactor2),
AGE1ATK = round(as.duration(interval(ymd(BRTHDTC, truncated = 2L), AGFIDDT)) / as.duration(years(1)),
digits = 0),
DURATION = AGECOL - AGE1ATK + 1,
# NOTE(review): BLTPHFN is used as-is; confirm it falls inside a logicfactor2 range above so TOPHIGOUT is logical
TOPHIGOUT = BLTPHFN,
EROSIONS = NA,
URATE = SCREENURATE,
# TRUE for recorded allopurinol / other ULT at screening, or screening urate < 8
ULT = ALLOSCR | ULTOSCR | SCREENURATE < 8,
PROPHY = PROPHTYP %in% c("Both", "Colchicine", "NSAID"),
HEART = ANGINA | MI,
# Same form as the 4-variable MDRD eGFR equation; assumes SCREENCREAT is in mg/dL — TODO confirm
EGFR = case_when(SEX == "Male" ~ 175 * (SCREENCREAT ^ -1.154) * (AGECOL ^ -0.203),
SEX == "Female" ~ 175 * (SCREENCREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
KIDNEY = RENALIMPAIR | EGFR < 60,
LIPIDS = HYPERCHOLESTEROL | HYPERTRIGLY,
TOTALALC = NA,
SUGDRINK = NA,
CURSMOKE = NA,
FAMGOUT = NA,
FAMGOUTNUM = NA) %>%
# Common column set shared by all cohort tables
select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# Ardea - Other (232 other Ardea study participants (594 and 3170)) - asked ruth if they have separate pheno files - she said we aren't allowed to use them - Tony suggested I just say Tony said why not use them
# They don't have phenotypes of interest so no point including them
# tmp <- CoreExPheno_Euro_Gout %>%
# filter(str_detect(Pheno.Study, "Ardea: 3170") | str_detect(Pheno.Study, "Ardea: 594"))
#
# other_ardea_euro_pheno <- read_delim(here("Data/Phenotypes/ArdeaPheno.txt"), delim = "\t") %>%
# filter(SUBJID %in% tmp$Pheno.SampleID)
#
# other_ardea_euro_pheno2 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoMain.txt"), delim = "\t") %>%
# filter(DNAID %in% tmp$Pheno.SampleID | SUBJID %in% tmp$Pheno.SampleID)
#
# rm(tmp, other_ardea_euro_pheno, other_ardea_euro_pheno2)
# DM
# Builds dm_pheno: the Diabetes Mellitus phenotype file restricted to CoreExome
# samples, joined to the CoreExome columns, recoded, and reduced to the common
# analysis columns.
tmp <- CoreExPheno_Final %>%
  filter(Pheno.Study == "Diabetes Mellitus")
dm_pheno <- read_delim(here("Data/Phenotypes/DMPheno.txt"), delim = "\t") %>%
  filter(PATIENT %in% tmp$Pheno.SampleID) %>%
  select(PATIENT, DOB, DATECOL, AGECOL, DIABETES:DIABETESTREAT, FAMGOUT:HIBPTREAT, LIPIDS, HEART:STROKE, KIDNEY:KIDNEY2, SUGDRINK, SMOKER:OTHALCO, WEIGHT, HEIGHT, BMI, URATE:CREAT, DIURETIC:OTHDIURETIC, LIPIDLOWER:BILEACIDSEQ, COMMENT, GOUTCRITERIAB, SUSTOPHUS:OTHDRUG, URATEDOX:DATEDOX, DIABETESAFFSTAT, KIDNEYTRANSPLANT, RENALDISEASE, FASTING:TRIGLY, SURICACID:EGFR) %>%
  left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>%
  rename(IID = PATIENT,
         GOUT = Pheno.GoutSummary,
         SEX = Geno.GeneticSex) %>%
  # 1/2-coded flags go through logicfactor (1 = FALSE, 2 = TRUE)
  mutate(across(where(is_character), factor),
         across(c(DIABETES, FAMGOUT, FAMGOUT3, HIBP, LIPIDS, HEART:STROKE, KIDNEY, DIURETIC:OTHDIURETIC, LIPIDLOWER:BILEACIDSEQ, GOUTCRITERIAB, SUSTOPHUS, TOPHUS, ALLOP:COLCHI, DIABETESAFFSTAT, KIDNEYTRANSPLANT, RENALDISEASE), logicfactor),
         DURATION = AGECOL - AGE1ATK + 1,
         TOPHIGOUT = TOPHUS | GOUTCRITERIAB | SUSTOPHUS,
         EROSIONS = NA,
         # Preference order: SURICACID, then URATE, then URATEDOX
         # (presumably * 1000 / 59.48 converts mmol/L to mg/dL — TODO confirm)
         URATE = case_when(!is.na(SURICACID) ~ SURICACID * 1000 / 59.48,
                           !is.na(URATE) ~ URATE * 1000 / 59.48,
                           TRUE ~ URATEDOX * 1000 / 59.48),
         ULT = ALLOP | PROBEN,
         PROPHY = STEROID | ANTIINFLAM | COLCHI | OTHDRUG != "no",
         HEIGHT = HEIGHT / 100,
         # Keep the recorded BMI, otherwise derive it from weight and the rescaled height
         BMI = case_when(!is.na(BMI) ~ BMI,
                         TRUE ~ WEIGHT / (HEIGHT * HEIGHT)),
         HYPERTENSION = HIBP | !is.na(HIBPTREAT) | DIURETIC | LOOPDIURETIC | THIAZIDEDIURETIC | OTHDIURETIC | DIURGOUT,
         DIABETES = DIABETES | !is.na(DIABETESTREAT) | DIABETESAFFSTAT,
         HEART = HEART | ANGINA | HEARTFAILURE | HEARTSURGERY | HEARTATTACK,
         CREAT = CREAT / 88.42,
         SCREAT = SCREAT / 88.42,
         # na.rm spelled as TRUE: T is an ordinary variable that can be reassigned
         CREAT2 = rowMeans(across(c(CREAT, SCREAT)), na.rm = TRUE),
         # Same form as the 4-variable MDRD eGFR equation; the recorded EGFR is
         # the fallback when SEX is neither "Male" nor "Female"
         EGFR = case_when(SEX == "Male" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203),
                          SEX == "Female" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203) * 0.742,
                          TRUE ~ EGFR),
         KIDNEY = KIDNEY | !is.na(KIDNEY2) | EGFR < 60 | KIDNEYTRANSPLANT | RENALDISEASE,
         LIPIDS = LIPIDS | LIPIDLOWER | STATIN | FIBRATES | EZETIMIBE | NICOTINICACID | BILEACIDSEQ,
         STROKE = STROKE,
         TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
         SUGDRINK = SUGDRINK,
         CURSMOKE = SMOKER == 2,
         FAMGOUT = FAMGOUT | FAMGOUT3,
         FAMGOUTNUM = FAMGOUT4) %>%
  # Common column set shared by all cohort tables
  select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# EuroGout
# Builds eurogout_pheno: the EuroGout phenotype file restricted to CoreExome
# samples, joined to the CoreExome columns, recoded, and reduced to the common
# analysis columns.
tmp <- CoreExPheno_Final %>%
  filter(Pheno.Study == "EuroGout")
eurogout_pheno <- read_delim(here("Data/Phenotypes/EuroGoutPheno.txt"), delim = "\t") %>%
  filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
  select(SUBJECT, RECRUITMENTDATE, DOB:WEIGHT, HEIGHT, BMI, TOPHUS:GOUTNOTES, ACRB, ACRC8, RENALDISEASE, T2DIABETES:HEARTFAILURE, MEDICALCOMMENT, URATETHERAPY:ALLOPURINOL, CHOLCHICINE:TLDIURETICS, ASPRIN, SUGARDRINK:FRUITJUICE, ALCOHOL:PREUTLKURATE, TCHOLESTEROL:TRIGLYCERIDES, EGFR) %>%
  left_join(tmp, by = c("SUBJECT" = "Pheno.SampleID")) %>%
  rename(IID = SUBJECT,
         GOUT = Pheno.GoutSummary,
         AGECOL = AGERECRUITMENT,
         SEX = Geno.GeneticSex) %>%
  # 0/1-coded flags go through logicfactor2 (0 = FALSE, 1 = TRUE)
  mutate(across(where(is_character), factor),
         across(c(TOPHUS, EROSIONS, ACRB, ACRC8, RENALDISEASE, T2DIABETES, HYPERTENSION, DYSLIPIDEMIA, STROKE:HEARTFAILURE, ALLOPURINOL:ASPRIN),
                logicfactor2),
         # Recorded age at first attack, otherwise age at recruitment minus gout duration
         AGE1ATK = case_when(!is.na(AGEFIRSTATTK) ~ AGEFIRSTATTK,
                             TRUE ~ AGECOL - DURATIONGOUT),
         DURATION = AGECOL - AGE1ATK + 1,
         # Numeric count when recorded, otherwise a hand-mapped value for each free-text answer
         NUMATK = case_when(!is.na(NUMATTACKS) ~ NUMATTACKS,
                            NUMATTACKS_TXT == ">5" ~ 5,
                            NUMATTACKS_TXT == "1" ~ 1,
                            NUMATTACKS_TXT == "2" ~ 2,
                            NUMATTACKS_TXT == "3" ~ 3,
                            NUMATTACKS_TXT %in% c("3 to 5", "3-5") ~ 4,
                            NUMATTACKS_TXT %in% c("reported 'continue' I think. I assume this means ongoing.", "reported 100.") ~ 52,
                            NUMATTACKS_TXT == "zehn" ~ 10),
         TOPHIGOUT = TOPHUS | NUMTOPHI %in% 1:3 | ACRB | ACRC8,
         # presumably * 1000 / 59.48 converts mmol/L to mg/dL — TODO confirm
         URATE = case_when(is.na(SERUMURATE) ~ PREUTLKURATE * 1000 / 59.48,
                           TRUE ~ SERUMURATE * 1000 / 59.48),
         # %in% (not ==) on the free-text note: a missing GOUTNOTES must count as
         # "no match" rather than turn every otherwise-FALSE result into NA
         ULT = GOUTNOTES %in% "Gout assumed, taking allopurinol" | (!is.na(URATETHERAPY) & !(URATETHERAPY %in% c("diet", "NIL", "no", "No uric acid lowering therapy", "none", "None", "NONE", "none listed", "Unclear"))) | ALLOPURINOL,
         PROPHY = CHOLCHICINE | NSAIDS | ASPRIN,
         HEIGHT = HEIGHT / 100,
         BMI = case_when(!is.na(BMI) ~ BMI,
                         TRUE ~ WEIGHT / (HEIGHT * HEIGHT)),
         # %in% here for the same reason, and to match the MEDICALCOMMENT test used for HEART below
         HYPERTENSION = HYPERTENSION | !is.na(HYPERTENTREATM) | MEDICALCOMMENT %in% "Said no to hypertension but beside BP states is on losartan" | DIURETICS | TLDIURETICS,
         DIABETES = T2DIABETES | !is.na(T2DTREATMENT),
         HEART = MI | IHD | HEARTFAILURE | MEDICALCOMMENT %in% c("Cardiovascular disease", "Heart problems", "Heart problems. EGFR available", "Heart problems. EGFR available. EGFR available. EGFR<60"),
         CREAT = SERUMCREATININE / 88.42,
         # Same form as the 4-variable MDRD eGFR equation; the recorded EGFR is
         # the fallback when SEX is neither "Male" nor "Female"
         EGFR = case_when(SEX == "Male" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203),
                          SEX == "Female" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742,
                          TRUE ~ EGFR),
         KIDNEY = RENALDISEASE | EGFR < 60,
         LIPIDS = DYSLIPIDEMIA | !is.na(LIPIDTREATMENT),
         STROKE = STROKE,
         TOTALALC = ALCOHOL,
         SUGDRINK = SUGARDRINK + FRUITJUICE,
         CURSMOKE = SMOKER == 1,
         FAMGOUT = FAMILYHISTORY == 1,
         FAMGOUTNUM = NUMFAMILYGOUT) %>%
  # Common column set shared by all cohort tables
  select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# Gout in Aotearoa
# Builds aotearoa_pheno: the NZ phenotype file restricted to CoreExome samples,
# joined to the CoreExome columns, recoded, and reduced to the common analysis
# columns.
tmp <- CoreExPheno_Final %>%
  filter(Pheno.Study == "Gout in Aotearoa")
aotearoa_pheno <- read_delim(here("Data/Phenotypes/NZPheno.txt"), delim = "\t") %>%
  filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
  select(SUBJECT, DATEARR, DOB, AGECOL, DIABETES, FAMGOUT, FAMGOUT3:HIBP, HIBPTREAT:FRUSEMIDE, BUMETANIDE, THIAZIDEDIURETIC:BENDROFLUAZIDE, HCTZ, METOLAZONE, CHLORHALIDONE, INDAPAMIDE, OTHDIURETIC, SPIRONOLACTONE, AMILORIDE, ACETAZOLAMIDE, DIURETICCOMMENT:DIURRECRUIT, LIPIDS, LIPIDLOWER:BILEACIDSEQ, HEART:STROKE, KIDNEY:HEALTHOTH, SUGDRINK, SMOKER:OTHALCO, WEIGHT:HEIGHT, BMI:BMICALC, MRURATE:MRCREATDATE, GOUTCRITERIAB, SUSTOPHUS:DIURGOUT, ALLOPCURRENT, PROBENCURRENT, BENZBROCURRENT, FEBUXCURRENT, OTHULTCURRENT, CURULTCOMMENT:ALLOPINTOLERANCE, ALLOPSIDE, URATEDOX:HIGHESTSUDATE, CHOLES:TRIGLY, SCREAT:SURICACID, URATE1MONTH, RELATEDFILTER:RELATED) %>%
  left_join(tmp, by = c("SUBJECT" = "Pheno.SampleID")) %>%
  rename(IID = SUBJECT,
         GOUT = Pheno.GoutSummary,
         SEX = Geno.GeneticSex) %>%
  # 1/2-coded flags go through logicfactor (1 = FALSE, 2 = TRUE)
  mutate(across(where(is_character), factor),
         across(c(FAMGOUT, FAMGOUT3, HIBP, DIURETIC:ACETAZOLAMIDE, LIPIDS:KIDNEY, FATTYLIVER, GOUTCRITERIAB:SUSTOPHUS, TOPHUS, ALLOPCURRENT:OTHULTCURRENT, ALLOPINTOLERANCE),
                logicfactor),
         # Recorded age at first attack, otherwise collection age minus the recorded duration;
         # DURATION is then recomputed from the result
         AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK,
                             TRUE ~ AGECOL - DURATION),
         DURATION = AGECOL - AGE1ATK + 1,
         NUMATK = NUMATK,
         TOPHIGOUT = GOUTCRITERIAB | SUSTOPHUS | TOPHUS,
         EROSIONS = NA,
         # SURICACID when present, otherwise the mean of the other available urate measurements
         # (presumably * 1000 / 59.48 converts mmol/L to mg/dL — TODO confirm)
         URATE = case_when(is.na(SURICACID) ~ rowMeans(across(c(MRURATE, URATEDOX, PREULTURATE, HIGHESTSU, URATE1MONTH)), na.rm = TRUE) * 1000 / 59.48,
                           TRUE ~ SURICACID * 1000 / 59.48),
         ULT = ALLOPCURRENT | PROBENCURRENT | BENZBROCURRENT | FEBUXCURRENT | OTHULTCURRENT,
         PROPHY = NA,
         HEIGHT = HEIGHT / 100,
         BMI = case_when(!is.na(BMI) ~ BMI,
                         TRUE ~ WEIGHT / (HEIGHT * HEIGHT)),
         HYPERTENSION = HIBP | !is.na(HIBPTREAT) | DIURETIC | DIURETICCURRENT | LOOPDIURETIC | FRUSEMIDE | BUMETANIDE | THIAZIDEDIURETIC | BENDROFLUAZIDE | HCTZ | METOLAZONE | CHLORHALIDONE | INDAPAMIDE | OTHDIURETIC | SPIRONOLACTONE | AMILORIDE | ACETAZOLAMIDE | !is.na(DIURETICCOMMENT) | DIURRECRUIT == 2 | DIURGOUT %in% 2:4,
         DIABETES = DIABETES == 2,
         HEART = HEART | ANGINA | HEARTFAILURE | HEARTSURGERY | HEARTATTACK,
         # na.rm = TRUE so a participant with only one of the two creatinine
         # values still gets an eGFR, as in the other cohort blocks
         CREAT = rowMeans(across(c(SCREAT, MRCREAT)), na.rm = TRUE) / 88.42,
         # Same form as the 4-variable MDRD eGFR equation
         EGFR = case_when(SEX == "Male" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203),
                          SEX == "Female" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
         KIDNEY = KIDNEY | !is.na(KIDNEY2) | EGFR < 60,
         LIPIDS = LIPIDS | LIPIDLOWER | STATIN | FIBRATES | EZETIMIBE | NICOTINICACID | BILEACIDSEQ,
         STROKE = STROKE,
         TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
         SUGDRINK = SUGDRINK,
         CURSMOKE = SMOKER == 2,
         FAMGOUT = FAMGOUT,
         FAMGOUTNUM = FAMGOUT4) %>%
  # Common column set shared by all cohort tables
  select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# LPA (don't have any phenotypes of interest for this study)
# Builds lpa_pheno: the LPA phenotype file restricted to CoreExome samples,
# joined to the CoreExome columns, and reduced to the common analysis columns.
# Gout-specific variables are not available and are set to NA.
tmp <- CoreExPheno_Final %>%
  filter(Pheno.Study == "LPA")
lpa_pheno <- read_delim(here("Data/Phenotypes/LPAPheno.txt"), delim = "\t") %>%
  filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
  select(SUBJECT:AGE, SMOKING, SMOKEHISTORY, SUGARDRINKS:DIABETESTYPE, MAINHYPERTENSION:DYSLIPIDCOMMENT, MAINSTROKE:MAINSTROKECOM, BMHEIGHT:BMWEIGHT, SERUMCREATININE:SERUMURATE, TOTALCHOLESTEROL, TRIGLYCERIDES) %>%
  left_join(tmp, by = c("SUBJECT" = "Pheno.SampleID")) %>%
  rename(IID = SUBJECT,
         GOUT = Pheno.GoutSummary,
         SEX = Geno.GeneticSex,
         AGECOL = AGE) %>%
  # 1/2-coded flags go through logicfactor (1 = FALSE, 2 = TRUE)
  mutate(across(where(is_character), factor),
         across(c(SMOKING:SMOKEHISTORY, MAINDIABETES, MAINHYPERTENSION, DYSLIPIDEMIA),
                logicfactor),
         AGE1ATK = NA,
         DURATION = NA,
         NUMATK = NA,
         TOPHIGOUT = NA,
         EROSIONS = NA,
         # NOTE(review): SERUMURATE is used without the * 1000 / 59.48 scaling applied in the EuroGout block — confirm units
         URATE = SERUMURATE,
         ULT = NA,
         PROPHY = NA,
         HEIGHT = BMHEIGHT / 100,
         BMI = BMWEIGHT / (HEIGHT * HEIGHT),
         HYPERTENSION = MAINHYPERTENSION,
         DIABETES = DIABETESTYPE == 2,
         HEART = NA,
         CREAT = SERUMCREATININE / 88.42,
         # Use the rescaled CREAT, as every other cohort block does. The raw
         # SERUMCREATININE was previously used here, which left CREAT unused and
         # put a differently scaled value into the same eGFR formula.
         EGFR = case_when(SEX == "Male" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203),
                          SEX == "Female" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
         KIDNEY = EGFR < 60,
         LIPIDS = DYSLIPIDEMIA,
         # NOTE(review): MAINSTROKE is not in the logicfactor list above — confirm its coding
         STROKE = MAINSTROKE,
         TOTALALC = NA,
         SUGDRINK = SUGARDRINKS,
         CURSMOKE = SMOKING,
         FAMGOUT = NA,
         FAMGOUTNUM = NA) %>%
  # Common column set shared by all cohort tables
  select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# Ngati Porou
# Builds nph_pheno: the Ngati Porou phenotype file restricted to CoreExome
# samples, joined to the CoreExome columns, recoded, and reduced to the common
# analysis columns.
tmp <- CoreExPheno_Final %>%
filter(Pheno.Study == "Ngati Porou")
nph_pheno <- read_delim(here("Data/Phenotypes/NPHPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(PATIENT, DOB, CONSENT, DATEARR, AGECOL, DIABETES, FAMGOUT:HIBP, LIPIDS, LIPIDLOWER:STROKE, KIDNEY, SUGDRINK, SMOKER:SPIRITS, WEIGHT:HEIGHT, BMI, URATE:CREATDATE, DIURETICCURRENT:FRUSEMIDE, BUMETANIDE, BENDROFLUAZIDE, HCTZ, METOLAZONE, CHLORHALIDONE, SPIRONOLACTONE, AMILORIDE, COMMENT, GOUTCRITERIAB, SUSTOPHUS, AGE1ATK:ALLOP, STEROID:OTHDRUG, URATEDOX:DATEDOX, RENALTRANSPLANT, DIABETESAFFSTAT, SURICACID:SCREAT, DIURETIC:OTHDIURETIC, STATIN:BILEACIDSEQ, URATELOWERING) %>%
left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>%
rename(IID = PATIENT,
GOUT = Pheno.GoutSummary,
SEX = Geno.GeneticSex) %>%
# 1/2-coded flags go through logicfactor (1 = FALSE, 2 = TRUE)
mutate(across(where(is_character), factor),
across(c(DIABETES, FAMGOUT, FAMGOUT3, HIBP:STROKE, KIDNEY, DIURETICCURRENT:AMILORIDE, GOUTCRITERIAB, SUSTOPHUS, TOPHUS, ALLOP:BENZOBROMARONE, RENALTRANSPLANT, DIABETESAFFSTAT, DIURETIC:URATELOWERING),
logicfactor),
# Recorded age at first attack, otherwise collection age minus the recorded duration;
# DURATION is then recomputed from the result
AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK,
TRUE ~ AGECOL - DURATION),
DURATION = AGECOL - AGE1ATK + 1,
NUMATK = NUMATK,
TOPHIGOUT = GOUTCRITERIAB | SUSTOPHUS | TOPHUS,
EROSIONS = NA,
# SURICACID when present, otherwise the mean of URATE and URATEDOX
# (presumably * 1000 / 59.48 converts mmol/L to mg/dL — TODO confirm)
URATE = case_when(is.na(SURICACID) ~ rowMeans(across(c(URATE, URATEDOX)), na.rm = TRUE) * 1000 / 59.48,
TRUE ~ SURICACID * 1000 / 59.48),
ULT = ALLOP | PROBEN | BENZOBROMARONE | URATELOWERING,
PROPHY = STEROID | ANTIINFLAM | COLCHI,
HEIGHT = HEIGHT / 100,
# NOTE(review): the recorded BMI column is overwritten here, whereas the DM, EuroGout and Aotearoa blocks keep a recorded BMI when present
BMI = WEIGHT / (HEIGHT * HEIGHT),
HYPERTENSION = HIBP | DIURETICCURRENT | FRUSEMIDE | BUMETANIDE | BENDROFLUAZIDE | HCTZ | METOLAZONE | CHLORHALIDONE | SPIRONOLACTONE | AMILORIDE | DIURGOUT %in% 2:4 | DIURETIC | LOOPDIURETIC | THIAZIDEDIURETIC | OTHDIURETIC,
DIABETES = DIABETES | DIABETESAFFSTAT,
HEART = HEART | ANGINA | HEARTFAILURE | HEARTSURGERY | HEARTATTACK,
# Mean of whichever creatinine values are present, rescaled by 88.42
CREAT = rowMeans(across(c(CREAT, SCREAT)), na.rm = TRUE) / 88.42,
# Same form as the 4-variable MDRD eGFR equation
EGFR = case_when(SEX == "Male" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203),
SEX == "Female" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
KIDNEY = KIDNEY | EGFR < 60 | RENALTRANSPLANT,
LIPIDS = LIPIDS | LIPIDLOWER | STATIN | FIBRATES | EZETIMIBE | NICOTINICACID | BILEACIDSEQ,
STROKE = STROKE,
# rowSums(na.rm = TRUE) yields 0, not NA, when all three drinks are missing
TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
SUGDRINK = SUGDRINK,
CURSMOKE = SMOKER == 2,
FAMGOUT = FAMGOUT,
FAMGOUTNUM = FAMGOUT4) %>%
# Common column set shared by all cohort tables
select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# RD
# Builds rd_pheno: the Renal Disease phenotype file restricted to CoreExome
# samples, joined to the CoreExome columns, recoded, and reduced to the common
# analysis columns.
tmp <- CoreExPheno_Final %>%
filter(Pheno.Study == "Renal Disease")
rd_pheno <- read_delim(here("Data/Phenotypes/RDPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(PATIENT, DOB, CONSENTDATE, DATECOL, DATEARR, CKDV, RENALTRANSPLANT, DIABETES, FAMGOUT, HYPERTENSION, DYSLIPIDAEMIA, IHD, CVA, CHF, HEALTHOTH:WEIGHT, BMI, SMOKER, SUGDRINK, BEER:SPIRITS, COMMENT, TYPE2D, GOUTCRITERIAB, SUSTOPHUS, AGE1ATK:OTHDRUG, ESSENTIALHYPERT, SURICACID:SCREAT, RCOMMENTS) %>%
left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>%
rename(IID = PATIENT,
GOUT = Pheno.GoutSummary,
SEX = Geno.GeneticSex) %>%
# 1/2-coded flags go through logicfactor (1 = FALSE, 2 = TRUE)
mutate(across(where(is_character), factor),
across(c(RENALTRANSPLANT:CHF, TYPE2D:SUSTOPHUS, TOPHUS, ALLOPURINOL:RASBURICASE),
logicfactor),
# The next two assignments leave the columns unchanged
AGECOL = AGECOL,
AGE1ATK = AGE1ATK,
DURATION = AGECOL - AGE1ATK + 1,
NUMATK = NA,
TOPHIGOUT = GOUTCRITERIAB | SUSTOPHUS | TOPHUS,
EROSIONS = NA,
# SURICACID when present, otherwise the mean of the other available urate measurements
# (presumably * 1000 / 59.48 converts mmol/L to mg/dL — TODO confirm)
URATE = case_when(is.na(SURICACID) ~ rowMeans(across(c(URATEFIRSTREC, URATEDOX, URATERECENT)), na.rm = TRUE) * 1000 / 59.48,
TRUE ~ SURICACID * 1000 / 59.48),
ULT = ALLOPURINOL | PROBEN | RASBURICASE,
PROPHY = STEROID | ANTIINFLAM | COLCHI,
HEIGHT = HEIGHT / 100,
# The recorded BMI column is overwritten with the value derived from weight and height
BMI = WEIGHT / (HEIGHT * HEIGHT),
HYPERTENSION = HYPERTENSION | ESSENTIALHYPERT == 1 | DIURGOUT %in% 2:4,
DIABETES = DIABETES | TYPE2D,
HEART = IHD | CHF,
# Same form as the 4-variable MDRD eGFR equation; `^` binds tighter than `*`, so only (SCREAT / 88.42) is raised to -1.154
EGFR = case_when(SEX == "Male" ~ 175 * (SCREAT / 88.42) ^ -1.154 * (AGECOL ^ -0.203),
SEX == "Female" ~ 175 * (SCREAT / 88.42) ^ -1.154 * (AGECOL ^ -0.203) * 0.742),
KIDNEY = CKDV == 1 | RENALTRANSPLANT | EGFR < 60,
LIPIDS = DYSLIPIDAEMIA,
STROKE = CVA,
# rowSums(na.rm = TRUE) yields 0, not NA, when all three drinks are missing
TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
SUGDRINK = SUGDRINK,
CURSMOKE = SMOKER == 2,
FAMGOUT = FAMGOUT,
FAMGOUTNUM = NA) %>%
# Common column set shared by all cohort tables
select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
# Stack the nine per-cohort tables into one table, sorted by IID.
# Any IID that occurs more than once is dropped entirely (every copy is
# removed, not just the later ones).
all_pheno <- rbind(
  agria_pheno, aotearoa_pheno, dm_pheno, eurogout_pheno, ironwood_pheno,
  lasso_pheno, lpa_pheno, nph_pheno, rd_pheno
) %>%
  mutate(Pheno.Study = factor(Pheno.Study)) %>%
  arrange(IID) %>%
  filter(!(IID %in% IID[duplicated(IID)]))

# The per-cohort tables and the raw CoreExome sheet are no longer needed
rm(
  agria_pheno, aotearoa_pheno, dm_pheno, eurogout_pheno, ironwood_pheno,
  lasso_pheno, lpa_pheno, nph_pheno, rd_pheno,
  tmp, CoreExPheno_Final
)
# UK Biobank
# Builds ukbb_pheno with the same column list as the CoreExome cohort tables,
# then saves all_pheno and ukbb_pheno together.

# Location of the phenotype data
ukbb_dir <- path("/Volumes/archive/merrimanlab/raid_backup/UKbiobank/")

## Loading data
# Latest UKBB phenotype file (provides refresh_ukbb_data)
load(path(ukbb_dir, "decrypted_files/ukb27189_27190_27191_27192_27193_27194_27195_27640_30070_31460_combined_withdrawn_ids_removed_10-07-2019.RData"))

# Extra fields taken from the refreshed phenotype file: BMI, alcohol intake
# frequency and current smoking
test <- refresh_ukbb_data %>%
  select(
    eid,
    body_mass_ind_bmi_f21001_0_0,
    alcohol_intake_frequency_f1558_0_0,
    current_tobacco_smoking_f1239_0_0
  )

# Urate-lowering therapy flag from the self-reported medication table
load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/Temp/self_report_med.RData")
tmp <- self_report_med %>%
  mutate(
    IID = eid,
    ULT = allopurinol | sulphinpyrazone | probenecid
  ) %>%
  select(IID, ULT)

load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/Temp/final_data.RData")
ukbb_pheno <- final_data %>%
  mutate(eid = as.numeric(eid)) %>%
  left_join(test, by = "eid") %>%
  rename(
    IID = eid,
    GOUT = gout,
    AGECOL = age,
    SEX = sex,
    URATE = urate
  ) %>%
  mutate(
    # The 20 genotype PC columns are filled with NA for UK Biobank; they are
    # created only so the column list matches the CoreExome cohorts.
    !!!setNames(
      rep(list(NA), 20),
      c(paste0("Geno.PCVector", 1:10), paste0("Geno.PCVector", 1:10, "_Oc"))
    ),
    # Gout-severity and other fields that are set to NA for this cohort
    AGE1ATK = NA,
    DURATION = NA,
    TOPHIGOUT = NA,
    EROSIONS = NA,
    NUMATK = NA,
    PROPHY = NA,
    Geno.SpecificAncestry = "European",
    BMI = body_mass_ind_bmi_f21001_0_0,
    HYPERTENSION = hypertension,
    DIABETES = type2_diabetes,
    HEART = coronary_heart_disease | heart_failure,
    KIDNEY = ckd_stage3 | ckd_stage4 | end_stage_renal,
    LIPIDS = dyslipidemia,
    STROKE = cerebrovascular_disease,
    # Map the three most frequent intake categories to a number; every other
    # answer becomes NA.
    # NOTE(review): confirm the intended unit and the choice of 14 / 4 / 2.
    TOTALALC = case_when(
      alcohol_intake_frequency_f1558_0_0 == "Daily or almost daily" ~ 14,
      alcohol_intake_frequency_f1558_0_0 == "Three or four times a week" ~ 4,
      alcohol_intake_frequency_f1558_0_0 == "Once or twice a week" ~ 2,
      TRUE ~ NA_real_
    ),
    SUGDRINK = NA,
    CURSMOKE = current_tobacco_smoking_f1239_0_0 == "Yes, on most or all days",
    FAMGOUT = NA,
    FAMGOUTNUM = NA,
    Geno.SampleID = NA,
    Pheno.Study = "UK Biobank"
  ) %>%
  # Join key made explicit: ULT is attached per participant ID
  left_join(tmp, by = "IID") %>%
  select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)

save(all_pheno, ukbb_pheno, file = here("Output/Phenotypes.RData"))
}
Individuals with a KING kinship coefficient > 0.177 (i.e. first-degree relatives or closer) were removed from the analysis, preferentially keeping gout cases over controls where possible.
# Use the KING algorithm to find all pairwise relationships that are above 0.177 in the CoreExome data (just based on the 9157 IDs that are in all_pheno)
# Need to prioritize the keeping of gout cases over controls
# Given that we don't care about the FID column for this, we will recreate the filtered genotyped CoreExome files so that the FID and IID are matching, and all are unique
# length(unique(all_pheno$Geno.SampleID)) # all unique
unique_ids <- all_pheno %>% select(Geno.SampleID)

# .fam columns are unnamed: X1 = FID, X2 = IID, ..., X5 = sex
fam <- read_delim(
  here("Data/Genotypes/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.fam"),
  delim = " ",
  col_names = FALSE
)

# FID/IID pairs of the samples that are in all_pheno (input for plink --keep)
fam_filtered <- fam %>%
  filter(X2 %in% unique_ids$Geno.SampleID) %>%
  select(X1, X2)
# write_delim(fam_filtered, delim = "\t", file = here("Output/Temp/unique_ids.txt"), col_names = F)
# system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Data/Genotypes/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted"), ' --keep ', here("Output/Temp/unique_ids.txt"), ' --chr 1-22 --make-bed --out ', here("Data/Genotypes/Clean_CoreEx_Geno")))

# Sex coded 1 = Male, 2 = Female (position in the factor levels)
all_pheno2 <- all_pheno %>%
  select(Geno.SampleID, SEX) %>%
  mutate(SEX = as.numeric(factor(SEX, levels = c("Male", "Female"))))

fam2 <- read_delim(
  here("Data/Genotypes/Clean_CoreEx_Geno.fam"),
  delim = " ",
  col_names = FALSE
)

# Rewrite the .fam so FID == IID, parents are 0 and sex comes from all_pheno.
# Sex is matched on X2 (the IID), the same column used to filter samples
# above; matching on X1 only works once FID has already been overwritten
# with the IID.
fam_clean <- fam2 %>%
  left_join(all_pheno2, by = c("X2" = "Geno.SampleID")) %>%
  mutate(
    X1 = X2,
    X3 = 0,
    X4 = 0,
    X5 = SEX
  ) %>%
  select(-SEX)
# write_delim(fam_clean, delim = " ", file = here("Data/Genotypes/Clean_CoreEx_Geno.fam"), col_names = F)
# system(paste0(here("king"), ' -b ', here("Data/Genotypes/Clean_CoreEx_Geno.bed"), ' --kinship')) # need to run this within the PRS directory rather than via this command

# Pairwise kinship table written by KING. It is read here unless an object
# called `relationships` already exists in the session, so the steps below no
# longer depend on it having been loaded by hand.
if (!exists("relationships")) {
  relationships <- read_delim(here("king.kin0"), delim = "\t")
}

test <- relationships %>%
  filter(Kinship > 0.177) # total of 517 pairwise relationships that are at least first degree relatives
test2 <- c(test$ID1, test$ID2) %>%
  unique() # 854 individuals in the 517 pairs

# Gout status of everyone who appears in at least one related pair
all_pheno_related <- all_pheno %>%
  filter(Geno.SampleID %in% test2) %>%
  select(Geno.SampleID, GOUT)
related_pairs <- test %>%
  select(ID1, ID2)

# Label each pair by the status of (ID1, ID2): G = gout, C = control.
# Pairs where either status matches neither label get Pair = NA.
related_pairs2 <- related_pairs %>%
  left_join(all_pheno_related, by = c("ID1" = "Geno.SampleID")) %>%
  left_join(all_pheno_related, by = c("ID2" = "Geno.SampleID")) %>%
  mutate(Pair = case_when(GOUT.x == "Gout" & GOUT.y == "Gout" ~ "GG",
                          GOUT.x == "Gout" & GOUT.y == "Control" ~ "GC",
                          GOUT.x == "Control" & GOUT.y == "Gout" ~ "CG",
                          GOUT.x == "Control" & GOUT.y == "Control" ~ "CC"))

# Split IDs into those that occur in more than one pair and those that occur
# in exactly one pair
test3 <- c(test$ID1, test$ID2)
test3_dup <- c(test$ID1, test$ID2) %>% duplicated()
test3_dup2 <- test3[test3_dup]                 # IDs seen more than once
test3_dup3 <- test3[test3 %in% test3_dup2]     # every occurrence of those IDs
test3_dup4 <- test3[!(test3 %in% test3_dup2)]  # IDs seen exactly once

# Pairs in which both members occur in no other pair: keep ID1 for GG / GC /
# CC pairs (dropping ID2), otherwise keep ID2 (dropping ID1)
related_pairs_unique <- related_pairs2 %>%
  filter(ID1 %in% test3_dup4,
         ID2 %in% test3_dup4) %>%
  mutate(keep1 = Pair %in% c('GG', 'GC', 'CC'),
         keep2 = !keep1)
remove1 <- c(related_pairs_unique %>% filter(keep2) %>% pull(ID1),
             related_pairs_unique %>% filter(keep1) %>% pull(ID2))

# Remaining pairs: those whose ID1 is not the ID1 of a pair handled above
related_pairs_multi <- related_pairs2 %>%
  filter(!(ID1 %in% related_pairs_unique$ID1))

# Gout/control pairs: drop the control
related_pairs_multi_gc <- related_pairs_multi %>%
  filter(Pair %in% c("GC", "CG")) %>%
  mutate(remove_id = case_when(Pair == "GC" ~ ID2,
                               Pair == "CG" ~ ID1))
remove2 <- related_pairs_multi_gc$remove_id %>% unique()

# Control/control pairs not already broken by remove2: drop ID1
related_pairs_multi_cc <- related_pairs_multi %>%
  filter(Pair == "CC",
         !(ID1 %in% remove2 | ID2 %in% remove2))
remove3 <- related_pairs_multi_cc$ID1 %>% unique()

# Gout/gout pairs: drop ID1
related_pairs_multi_gg <- related_pairs_multi %>%
  filter(Pair == "GG")
remove4 <- related_pairs_multi_gg$ID1 %>% unique()

remove_list <- c(remove1, remove2, remove3, remove4)
all_pheno_dedup <- all_pheno %>%
  filter(!(Geno.SampleID %in% remove_list))
save(all_pheno_dedup, ukbb_pheno, file = here("Output/Phenotypes_dedup.RData"))
The UKBB gout PRS and the Tin urate PRSs were then calculated for all European and Polynesian samples. Using the results of the UKBB GWAS (genotyped variants only), 21 PRS metrics were generated: the complete PRS, each of the 19 variants individually, and the PRS without either ABCG2 variant. For the Tin GWAS this number was 84 (two versions of the PRS and 82 unique variants, some of which overlap with the 19 of the gout GWAS).
load(here("Output/Phenotypes_dedup.RData"))
load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Tin_Gene_OR.RData"))

# Extracting variants from Tin that overlap UKBB
tin <- vroom("/Volumes/scratch/merrimanlab/Nick/PRS/tin_filtered.txt",
             delim = "\t",
             col_names = TRUE)

# Risk allele in Tin = the allele with the positive effect
tin2 <- tin %>%
  filter(RSID %in% UKBB_Gene_OR$RSID) %>%
  mutate(risk_allele = case_when(Effect > 0 ~ toupper(Allele1),
                                 Effect < 0 ~ toupper(Allele2)))

# tin2 is in the row order of the Tin file, but it is compared with and
# copied into UKBB_Gene_OR row by row below. Put it into UKBB_Gene_OR order
# by RSID so that each row really refers to the same variant, and stop if a
# UKBB variant is absent from Tin.
stopifnot(all(UKBB_Gene_OR$RSID %in% tin2$RSID))
tin2 <- tin2[match(UKBB_Gene_OR$RSID, tin2$RSID), ]

sum(tin2$risk_allele == UKBB_Gene_OR$Effect_Allele) # all same effect direction, so can just take absolute of effect

different <- UKBB_Gene_OR %>% filter(!(RSID %in% Tin_Gene_OR$RSID)) %>% select(CHR:BP, BP1:Locus_Name) # 4 are the secondary hits at the same loci
different <- different %>% filter(!(Locus_Name %in% c("ABCG2", "SLC2A9", "SLC22A11")))

# Per-variant gout (log OR) and urate (absolute Tin effect) weights for the
# variants in Tin_Gene_OR plus the loci kept in `different`; OR_old is used
# in place of OR wherever it is not NA
tmp_gene_or <- UKBB_Gene_OR %>%
  mutate(Tin_Effect = abs(tin2$Effect)) %>%
  filter(RSID %in% c(Tin_Gene_OR$RSID, different$RSID)) %>%
  mutate(OR = case_when(is.na(OR_old) ~ OR,
                        TRUE ~ OR_old),
         Gout_Effect = log(OR),
         Effect_Ratio = Gout_Effect / Tin_Effect) %>%
  select(CHR:BP, EAF, Locus_Name, Gout_Effect, Tin_Effect, Effect_Ratio)
# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/SNPs.map"),
                  delim = "\t",
                  col_names = FALSE)
x <- read_delim(here("Output/Temp/SNPs.ped"),
                delim = " ",
                col_names = FALSE,
                col_types = cols(.default = col_character()))
# .ped layout: six sample columns, then two allele columns per variant in
# .map order
colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = ncol(x), by = 2)] <- str_c(map$X2, "_2")
x <- x %>% filter(IID %in% (all_pheno_dedup$Geno.SampleID))

# Positions that occur in both variant lists, and their effect alleles side
# by side for a manual check
tmp <- UKBB_Gene_OR %>%
  select(CHR, BP) %>%
  rbind(select(Tin_Gene_OR, CHR, BP)) %>%
  filter(duplicated(BP)) %>%
  arrange(CHR, BP)
tmp1 <- UKBB_Gene_OR %>%
  filter(BP %in% tmp$BP) %>%
  select(Effect_Allele) %>%
  cbind(Tin_Gene_OR %>% filter(BP %in% tmp$BP) %>% select(Effect_Allele)) # All risk alleles are the same

# Union of the gout and urate variants, one row each, ordered by position
Combined_Gene_OR <- UKBB_Gene_OR %>%
  select(CHR, BP, RSID, Effect_Allele, Alternate_Allele) %>%
  rbind(Tin_Gene_OR %>% select(CHR, BP, RSID, Effect_Allele, Alternate_Allele)) %>%
  arrange(CHR, BP) %>%
  unique()

# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)

# Recode one allele column: "0" -> NA, effect allele -> 1, alternate
# allele -> 0. Anything else cannot be parsed as a number and becomes NA.
recode_allele <- function(calls, effect, alternate) {
  calls %>%
    str_replace("0", "NA") %>%
    str_replace(effect, "1") %>%
    str_replace(alternate, "0") %>%
    as.numeric()
}

# Variant i of Combined_Gene_OR is read from allele columns 2i + 5 and 2i + 6.
# NOTE(review): this relies on SNPs.ped holding the variants in the same
# order as Combined_Gene_OR (sorted by CHR, BP) -- confirm.
for (i in seq_len(nrow(Combined_Gene_OR))) {
  effect <- Combined_Gene_OR[[i, "Effect_Allele"]]
  alternate <- Combined_Gene_OR[[i, "Alternate_Allele"]]
  allele_1 <- recode_allele(x[[2 * i + 5]], effect, alternate)
  allele_2 <- recode_allele(x[[2 * i + 6]], effect, alternate)
  # Dosage (0 / 1 / 2 effect alleles) is appended as a column named by RSID
  x[[Combined_Gene_OR[[i, "RSID"]]]] <- allele_1 + allele_2
}

# Keep IID plus the appended dosage columns
x <- x %>%
  select(2, (num_cols + 1):ncol(x))

# Save this dataframe for individual SNP analysis
x1 <- x
# Now to calculate the PRS

# Weighted risk score from a dosage table.
#   dosages     table with IID in column 1 and one dosage column per RSID
#   rsids       RSIDs to include
#   weights     per-variant weights, in the same order as rsids
#   score_name  name of the score column; the second output column is
#               "<score_name>_noABCG2"
#   abcg2_snps  RSIDs whose weighted contribution is subtracted to give the
#               "_noABCG2" score
# Returns a tibble with IID, the score and the "_noABCG2" score. The score is
# NA for any sample with a missing dosage among rsids.
weighted_prs <- function(dosages, rsids, weights, score_name,
                         abcg2_snps = "rs2231142") {
  stopifnot(length(rsids) == length(weights), all(abcg2_snps %in% rsids))
  weighted <- dosages %>% select(1, all_of(rsids))
  for (j in seq_along(rsids)) {
    weighted[[j + 1]] <- weighted[[j + 1]] * weights[[j]]
  }
  score <- rowSums(weighted[-1])
  score_no_abcg2 <- score
  for (snp in abcg2_snps) {
    score_no_abcg2 <- score_no_abcg2 - weighted[[snp]]
  }
  out <- tibble(IID = weighted[["IID"]], score, score_no_abcg2)
  names(out)[2:3] <- c(score_name, paste0(score_name, "_noABCG2"))
  out
}

# Gout PRS: UKBB variants weighted by log(OR); the noABCG2 version removes
# both rs2231142 and rs10011796
x <- weighted_prs(x1, UKBB_Gene_OR$RSID, log(UKBB_Gene_OR[["OR"]]), "PRS",
                  abcg2_snps = c("rs2231142", "rs10011796"))
all_pheno_prs <- all_pheno_dedup %>%
  left_join(x, by = c("Geno.SampleID" = "IID"))

# Urate PRS: Tin variants weighted by Beta. The per-variant dosages (x1) are
# attached at this point as well.
x <- weighted_prs(x1, Tin_Gene_OR$RSID, Tin_Gene_OR[["Beta"]], "Urate_PRS")
all_pheno_prs <- all_pheno_prs %>%
  left_join(x, by = c("Geno.SampleID" = "IID")) %>%
  left_join(x1, by = c("Geno.SampleID" = "IID"))

# PRS2: variants of tmp_gene_or weighted by their gout effect
x <- weighted_prs(x1, tmp_gene_or$RSID, tmp_gene_or[["Gout_Effect"]], "PRS2")
all_pheno_prs <- all_pheno_prs %>%
  left_join(x, by = c("Geno.SampleID" = "IID"))

# Urate_PRS2: the same variants weighted by their Tin (urate) effect
x <- weighted_prs(x1, tmp_gene_or$RSID, tmp_gene_or[["Tin_Effect"]], "Urate_PRS2")
all_pheno_prs <- all_pheno_prs %>%
  left_join(x, by = c("Geno.SampleID" = "IID"))

# Unweighted_PRS: the same variants, plain count of effect alleles
x <- weighted_prs(x1, tmp_gene_or$RSID, rep(1, nrow(tmp_gene_or)), "Unweighted_PRS")
all_pheno_prs <- all_pheno_prs %>%
  left_join(x, by = c("Geno.SampleID" = "IID"))
# UK Biobank
# Same genotype -> dosage conversion as for the CoreExome samples, applied to
# the merged UK Biobank .ped/.map export.
map <- read_delim(here("Output/Temp/merged_PRS.map"),
                  delim = "\t",
                  col_names = FALSE) %>%
  # Keep only the part of the variant ID before the first comma
  separate(X2, into = c("X2", NA), sep = ",")
x <- read_delim(here("Output/Temp/merged_PRS.ped"),
                delim = " ",
                col_names = FALSE,
                col_types = cols(.default = col_character()))
colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = ncol(x), by = 2)] <- str_c(map$X2, "_2")

# Sample columns plus the two allele columns of every PRS variant
tmp <- c("FID", "IID", "PID", "MID", "SEX", "AFF", str_c(Combined_Gene_OR$RSID, "_1"), str_c(Combined_Gene_OR$RSID, "_2"))
ids <- ukbb_pheno$IID
x <- x %>%
  select(which(colnames(x) %in% tmp)) %>%
  mutate(IID = as.numeric(IID)) %>%
  filter(IID %in% ids)

# Every PRS variant must be present; otherwise the dosages below would be
# silently wrong or missing
absent <- setdiff(tmp, colnames(x))
if (length(absent) > 0) {
  stop("merged_PRS.ped is missing columns: ", paste(absent, collapse = ", "),
       call. = FALSE)
}

# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)

# Recode one allele column: "0" -> NA, effect allele -> 1, alternate
# allele -> 0. Anything else cannot be parsed as a number and becomes NA.
recode_allele <- function(calls, effect, alternate) {
  calls %>%
    str_replace("0", "NA") %>%
    str_replace(effect, "1") %>%
    str_replace(alternate, "0") %>%
    as.numeric()
}

# The allele columns are looked up by name ("<RSID>_1" / "<RSID>_2") rather
# than by position, so the result does not depend on the variant order of
# the .ped file matching Combined_Gene_OR.
for (i in seq_len(nrow(Combined_Gene_OR))) {
  rsid <- Combined_Gene_OR[[i, "RSID"]]
  effect <- Combined_Gene_OR[[i, "Effect_Allele"]]
  alternate <- Combined_Gene_OR[[i, "Alternate_Allele"]]
  allele_1 <- recode_allele(x[[str_c(rsid, "_1")]], effect, alternate)
  allele_2 <- recode_allele(x[[str_c(rsid, "_2")]], effect, alternate)
  # Dosage (0 / 1 / 2 effect alleles) is appended as a column named by RSID
  x[[rsid]] <- allele_1 + allele_2
}

# Keep IID plus the appended dosage columns
x <- x %>%
  select(2, (num_cols + 1):ncol(x))

# Save this dataframe for individual SNP analysis
x1 <- x
# Now to calculate the PRS

# Weighted risk score from a dosage table.
#   dosages     table with IID in column 1 and one dosage column per RSID
#   rsids       RSIDs to include
#   weights     per-variant weights, in the same order as rsids
#   score_name  name of the score column; the second output column is
#               "<score_name>_noABCG2"
#   abcg2_snps  RSIDs whose weighted contribution is subtracted to give the
#               "_noABCG2" score
# Returns a tibble with IID, the score and the "_noABCG2" score. The score is
# NA for any sample with a missing dosage among rsids.
weighted_prs <- function(dosages, rsids, weights, score_name,
                         abcg2_snps = "rs2231142") {
  stopifnot(length(rsids) == length(weights), all(abcg2_snps %in% rsids))
  weighted <- dosages %>% select(1, all_of(rsids))
  for (j in seq_along(rsids)) {
    weighted[[j + 1]] <- weighted[[j + 1]] * weights[[j]]
  }
  score <- rowSums(weighted[-1])
  score_no_abcg2 <- score
  for (snp in abcg2_snps) {
    score_no_abcg2 <- score_no_abcg2 - weighted[[snp]]
  }
  out <- tibble(IID = weighted[["IID"]], score, score_no_abcg2)
  names(out)[2:3] <- c(score_name, paste0(score_name, "_noABCG2"))
  out
}

# IID is converted to a factor on both sides of every join below

# Gout PRS: UKBB variants weighted by log(OR); the noABCG2 version removes
# both rs2231142 and rs10011796
x <- weighted_prs(x1, UKBB_Gene_OR$RSID, log(UKBB_Gene_OR[["OR"]]), "PRS",
                  abcg2_snps = c("rs2231142", "rs10011796")) %>%
  mutate(IID = factor(IID))
all_pheno_prs2 <- ukbb_pheno %>%
  mutate(IID = factor(IID)) %>%
  left_join(x, by = "IID")

# Urate PRS: Tin variants weighted by Beta. The per-variant dosages are
# attached at this point as well.
x <- weighted_prs(x1, Tin_Gene_OR$RSID, Tin_Gene_OR[["Beta"]], "Urate_PRS") %>%
  mutate(IID = factor(IID))
x1a <- x1 %>%
  mutate(IID = factor(IID))
all_pheno_prs2 <- all_pheno_prs2 %>%
  left_join(x, by = "IID") %>%
  left_join(x1a, by = "IID")

# PRS2: variants of tmp_gene_or weighted by their gout effect
x <- weighted_prs(x1, tmp_gene_or$RSID, tmp_gene_or[["Gout_Effect"]], "PRS2") %>%
  mutate(IID = factor(IID))
all_pheno_prs2 <- all_pheno_prs2 %>%
  left_join(x, by = "IID")

# Urate_PRS2: the same variants weighted by their Tin (urate) effect
x <- weighted_prs(x1, tmp_gene_or$RSID, tmp_gene_or[["Tin_Effect"]], "Urate_PRS2") %>%
  mutate(IID = factor(IID))
all_pheno_prs2 <- all_pheno_prs2 %>%
  left_join(x, by = "IID")

# Unweighted_PRS: the same variants, plain count of effect alleles
x <- weighted_prs(x1, tmp_gene_or$RSID, rep(1, nrow(tmp_gene_or)), "Unweighted_PRS") %>%
  mutate(IID = factor(IID))
all_pheno_prs2 <- all_pheno_prs2 %>%
  left_join(x, by = "IID")

# Recode GOUT in the CoreExome table from "Gout" / "Control" to TRUE / FALSE
# before combining it with the UK Biobank table. full_join() is called
# without `by`, so it matches on every column the two tables share.
all_pheno_prs <- all_pheno_prs %>%
  mutate(GOUT = as.logical(case_when(GOUT == "Gout" ~ TRUE,
                                     GOUT == "Control" ~ FALSE))) %>%
  full_join(all_pheno_prs2)

save(all_pheno_prs, file = here("Output/all_pheno_prs.RData"))
rm(all_pheno_prs2, map, x, x1, x1a, i, ids, num_cols, tmp, ukbb_pheno, tmp1, all_pheno_dedup, different, tin, tin2)
Data Exploration
The purpose of this section is to explore the data that was generated in the previous section.
After loading in the data, I first produced a table of missing data for all of my variables of interest. I used this as evidence for filtering individuals. After filtering the data, and removing certain phenotypes that had too much missing data, I then re-observed the missingness and reported the proportion for each variable.
Next, I produced a table that shows the distribution of all variables of interest in all cohorts. I then plotted each of these distributions in all cohorts. Next, I plotted the relationship between several combinations of variables. Finally, I plotted the minor allele frequency of all SNPs in each cohort.
load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/all_pheno_prs.RData"))

# Add FLARE_CAT (ordered flare-frequency category derived from NUMATK) and
# blank the gout-severity traits for every row where GOUT is not TRUE.
# NOTE(review): FLARE_CAT is computed from NUMATK before NUMATK is blanked,
# so a control with a recorded NUMATK keeps a FLARE_CAT value -- confirm
# this is intended.
all_pheno_prs <- all_pheno_prs %>%
  mutate(
    FLARE_CAT = ordered(
      case_when(
        between(NUMATK, 0, 5) ~ paste0(as.character(NUMATK), " flares in last year"),
        between(NUMATK, 6, 11) ~ "One every one to two months",
        between(NUMATK, 12, 52) ~ "One or more per month"
      ),
      levels = c(
        paste0(0:5, " flares in last year"),
        "One every one to two months",
        "One or more per month"
      ),
      labels = c(as.character(0:5), "6 - 11", "12 - 52")
    ),
    # Keep the value for gout cases, NA otherwise
    across(
      c(AGE1ATK, DURATION, NUMATK, TOPHIGOUT, ULT),
      ~ case_when(GOUT ~ .x)
    )
  )

# Sex-specific copies used to build the per-cohort subsets
all_pheno_prs_male <- filter(all_pheno_prs, SEX == "Male")
all_pheno_prs_female <- filter(all_pheno_prs, SEX == "Female")
# Plain-text cohort labels, used as the "Cohort" column of the summary tables.
# Within each cohort the order is group (e.g. Gout, Control) by sex, with
# Male before Female.
cohortstring <- local({
  case_control <- c("Gout", "Control")
  ardea_trials <- c("LASSO", "CLEAR1", "CLEAR2", "CRYSTAL", "LIGHT")

  # "<cohort> - <group> - <sex><suffix>" for every group x sex combination,
  # sex varying fastest
  make_labels <- function(cohort, groups, suffix = "") {
    grid <- expand.grid(
      sex = c("Male", "Female"),
      group = groups,
      stringsAsFactors = FALSE
    )
    paste0(cohort, " - ", grid$group, " - ", grid$sex, suffix)
  }

  c(
    make_labels("UK Biobank", case_control),
    make_labels("Aus/NZ European", case_control),
    make_labels("GlobalGout", case_control),
    make_labels("Ardea", ardea_trials),
    make_labels("East Polynesian", case_control),
    make_labels("East Polynesian", case_control, suffix = " - NP"),
    make_labels("West Polynesian", case_control)
  )
})
# One data subset per cohort label (34 in total, same order as cohortstring):
# each cohort is split into gout / control and male / female, except the
# Ardea trials, which are split by sex only.
data_list <- local({
  european <- c("European", "European; Iberian", "Iberian")
  aus_nz_studies <- c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")
  west_polynesian <- c("West Polynesian", "Niuean", "Pukapukan")
  ardea_trials <- c("Ardea: 401", "Ardea: CLEAR1", "Ardea: CLEAR2", "Ardea: CRYSTAL", "Ardea: LIGHT")

  # Gout male, gout female, control male, control female for the rows that
  # satisfy the given cohort conditions
  gout_control_by_sex <- function(...) {
    in_cohort <- enquos(...)
    list(
      all_pheno_prs_male %>% filter(GOUT, !!!in_cohort),
      all_pheno_prs_female %>% filter(GOUT, !!!in_cohort),
      all_pheno_prs_male %>% filter(!GOUT, !!!in_cohort),
      all_pheno_prs_female %>% filter(!GOUT, !!!in_cohort)
    )
  }

  # Male then female Europeans of one Ardea trial (no gout / control split)
  ardea_by_sex <- function(trial) {
    list(
      all_pheno_prs_male %>%
        filter(Pheno.Study == trial, Geno.SpecificAncestry %in% european),
      all_pheno_prs_female %>%
        filter(Pheno.Study == trial, Geno.SpecificAncestry %in% european)
    )
  }

  c(
    # UK Biobank
    gout_control_by_sex(Pheno.Study == "UK Biobank"),
    # Aus/NZ European
    gout_control_by_sex(
      Geno.SpecificAncestry %in% european,
      Pheno.Study %in% aus_nz_studies
    ),
    # GlobalGout (Pheno.Study "EuroGout")
    gout_control_by_sex(
      Pheno.Study == "EuroGout",
      Geno.SpecificAncestry %in% european
    ),
    # Ardea trials
    do.call(c, lapply(ardea_trials, ardea_by_sex)),
    # East Polynesian, excluding Ngati Porou
    gout_control_by_sex(
      Geno.SpecificAncestry %in% c("East Polynesian"),
      Pheno.Study != "Ngati Porou"
    ),
    # East Polynesian, Ngati Porou only ("NP")
    gout_control_by_sex(
      Geno.SpecificAncestry %in% c("East Polynesian"),
      Pheno.Study == "Ngati Porou"
    ),
    # West Polynesian
    gout_control_by_sex(Geno.SpecificAncestry %in% west_polynesian)
  )
})
# HTML column headers for the kable output: the same cohorts as cohortstring,
# with "<br/>" line breaks between the parts. The Ardea headers carry an
# extra "Gout" line after the trial name.
clean_names <- local({
  case_control <- c("Gout", "Control")
  ardea_trials <- paste0(
    c("LASSO", "CLEAR1", "CLEAR2", "CRYSTAL", "LIGHT"),
    "<br/>Gout"
  )

  # "<cohort><br/><group><br/><sex><suffix>" for every group x sex
  # combination, sex varying fastest
  make_headers <- function(cohort, groups, suffix = "") {
    grid <- expand.grid(
      sex = c("Male", "Female"),
      group = groups,
      stringsAsFactors = FALSE
    )
    paste0(cohort, "<br/>", grid$group, "<br/>", grid$sex, suffix)
  }

  c(
    make_headers("UK Biobank", case_control),
    make_headers("Aus/NZ European", case_control),
    make_headers("GlobalGout", case_control),
    make_headers("Ardea", ardea_trials),
    make_headers("East Polynesian", case_control),
    make_headers("East Polynesian", case_control, suffix = "<br/>NP"),
    make_headers("West Polynesian", case_control)
  )
})
# Functions
# Format a vector as "mean ± sd", both to one decimal place.
# Returns the string "NA" when every value is missing (or x is empty).
report <- function(x) {
  if (sum(is.na(x)) == length(x)) {
    return("NA")
  }
  centre <- sprintf("%#.1f", mean(x, na.rm = TRUE))
  spread <- sprintf("%#.1f", sd(x, na.rm = TRUE))
  paste0(centre, " ± ", spread)
}
# Format a numeric vector as "median (Q1 - Q3)".
# Returns the string "NA" when every value is missing (or x is empty).
# The quartiles come straight from quantile() (default type, NAs removed)
# instead of being picked by position out of two summary() calls.
report_median <- function(x) {
  if (sum(is.na(x)) == length(x)) {
    return("NA")
  }
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  paste0(
    median(x, na.rm = TRUE),
    " (", quartiles[[1]], " - ", quartiles[[2]], ")"
  )
}
# Format a logical / 0-1 vector as "count (percent)": the sum of the
# non-missing values, and their mean times 100 to one decimal place.
# Returns the string "NA" when every value is missing (or x is empty).
sumreport <- function(x) {
  if (sum(is.na(x)) == length(x)) {
    return("NA")
  }
  count <- sum(x, na.rm = TRUE)
  percent <- sprintf("%#.1f", mean(x, na.rm = TRUE) * 100)
  paste0(count, " (", percent, ")")
}
# Transpose a data frame so that its columns become rows.
# The original column names are placed in the first column of the transposed
# table, and the first transposed row (the values of df's first column) is
# then promoted to the column names and removed.
transpose_df <- function(df) {
  flipped <- data.table::transpose(df)
  colnames(flipped) <- rownames(df)
  rownames(flipped) <- colnames(df)

  with_labels <- as_tibble(rownames_to_column(flipped))
  row_to_names(with_labels, row_number = 1)
}
# Summarise missingness of a vector for the missing-data table:
#   "All"    every value is NA (also returned for an empty vector)
#   "None"   no value is NA
#   otherwise "<number missing> (<percent missing>)", with a thousands
#   separator in the count and one decimal place in the percentage.
# Note: this definition masks base::missing() for code that runs after it.
missing <- function(x) {
  n_total <- length(x)
  n_missing <- sum(is.na(x))

  if (n_missing == n_total) {
    return("All")
  }
  if (sum(!is.na(x)) == n_total) {
    return("None")
  }

  pct_missing <- round((n_missing / n_total * 100), digits = 1)
  paste0(
    format(n_missing, big.mark = ","),
    " (", format(pct_missing, nsmall = 1), ")"
  )
}
Table of Missing Data Percentages
# Missing-data table: one row per cohort subset, one column per variable,
# each cell formatted by missing() ("All", "None" or "n (%)").
missing_columns <- c(
  "Age at Collection" = "AGECOL",
  "Serum Urate" = "URATE",
  "ULT" = "ULT",
  "Age at Onset" = "AGE1ATK",
  "Disease Duration" = "DURATION",
  "Flares" = "NUMATK",
  "Tophi" = "TOPHIGOUT",
  "PRS" = "PRS",
  "Urate PRS" = "Urate_PRS",
  "Prophylaxis" = "PROPHY",
  "BMI" = "BMI",
  "Hypertension" = "HYPERTENSION",
  "Type 2 Diabetes" = "DIABETES",
  "Heart Disease" = "HEART",
  "Kidney Disease" = "KIDNEY",
  "Dyslipidemia" = "LIPIDS",
  "Stroke" = "STROKE",
  "Alcoholic Drinks / Week" = "TOTALALC",
  "Sugar-Sweetened Drinks / Week" = "SUGDRINK",
  "Current Smoker" = "CURSMOKE",
  "Family History of Gout" = "FAMGOUT",
  "No. Relatives w/ Gout" = "FAMGOUTNUM"
)
table1 <- tibble(
  "Cohort" = cohortstring,
  "N" = vapply(data_list, nrow, integer(1)),
  !!!lapply(missing_columns, function(column) {
    vapply(data_list, function(cohort) missing(cohort[[column]]), character(1))
  })
)

# Flip to variables-as-rows / cohorts-as-columns and make every space
# non-breaking so that cells and row labels do not wrap in the HTML table.
# across(everything()) covers all cohort columns; the previous
# 1:ncol(table1) used the column count of the table before transposing and
# therefore skipped the last cohort columns.
table1 <- transpose_df(table1) %>%
  column_to_rownames(var = "Cohort") %>%
  mutate(across(everything(), ~ str_replace_all(string = .x, pattern = " ", replacement = "&nbsp;")))
row.names(table1) <- str_replace_all(row.names(table1), " ", "&nbsp;")

table1 %>%
  kable(col.names = clean_names,
        align = "c",
        escape = FALSE) %>%
  kable_styling("striped") %>%
  scroll_box(width = "900px", height = "475px") %>%
  footnote("'All' = all missing, 'None' = none missing")
|
|
UK Biobank Gout Male
|
UK Biobank Gout Female
|
UK Biobank Control Male
|
UK Biobank Control Female
|
Aus/NZ European Gout Male
|
Aus/NZ European Gout Female
|
Aus/NZ European Control Male
|
Aus/NZ European Control Female
|
GlobalGout Gout Male
|
GlobalGout Gout Female
|
GlobalGout Control Male
|
GlobalGout Control Female
|
Ardea LASSO Gout Male
|
Ardea LASSO Gout Female
|
Ardea CLEAR1 Gout Male
|
Ardea CLEAR1 Gout Female
|
Ardea CLEAR2 Gout Male
|
Ardea CLEAR2 Gout Female
|
Ardea CRYSTAL Gout Male
|
Ardea CRYSTAL Gout Female
|
Ardea LIGHT Gout Male
|
Ardea LIGHT Gout Female
|
East Polynesian Gout Male
|
East Polynesian Gout Female
|
East Polynesian Control Male
|
East Polynesian Control Female
|
East Polynesian Gout Male NP
|
East Polynesian Gout Female NP
|
East Polynesian Control Male NP
|
East Polynesian Control Female NP
|
West Polynesian Gout Male
|
West Polynesian Gout Female
|
West Polynesian Control Male
|
West Polynesian Control Female
|
|
N
|
8394
|
995
|
177998
|
214876
|
1040
|
227
|
769
|
623
|
1627
|
221
|
45
|
84
|
826
|
66
|
231
|
16
|
241
|
7
|
178
|
4
|
110
|
8
|
421
|
133
|
243
|
385
|
128
|
29
|
44
|
37
|
457
|
62
|
203
|
184
|
|
Age at Collection
|
None
|
None
|
None
|
None
|
1 (0.1)
|
None
|
None
|
None
|
149 (9.2)
|
18 (8.1)
|
1 (2.2)
|
5 (6.0)
|
5 (0.6)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
1 (0.5)
|
|
Serum Urate
|
393 (4.7)
|
59 (5.9)
|
8,374 (4.7)
|
10,379 (4.8)
|
34 (3.3)
|
8 (3.5)
|
36 (4.7)
|
25 (4.0)
|
309 (19.0)
|
53 (24.0)
|
33 (73.3)
|
76 (90.5)
|
4 (0.5)
|
1 (1.5)
|
1 (0.4)
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
4 (1.0)
|
1 (0.8)
|
35 (14.4)
|
65 (16.9)
|
1 (0.8)
|
1 (3.4)
|
None
|
None
|
2 (0.4)
|
2 (3.2)
|
22 (10.8)
|
17 (9.2)
|
|
ULT
|
None
|
None
|
All
|
All
|
453 (43.6)
|
114 (50.2)
|
All
|
All
|
706 (43.4)
|
108 (48.9)
|
All
|
All
|
5 (0.6)
|
1 (1.5)
|
None
|
None
|
None
|
None
|
81 (45.5)
|
3 (75.0)
|
None
|
None
|
136 (32.3)
|
39 (29.3)
|
All
|
All
|
23 (18.0)
|
2 (6.9)
|
All
|
All
|
134 (29.3)
|
19 (30.6)
|
All
|
All
|
|
Age at Onset
|
All
|
All
|
All
|
All
|
105 (10.1)
|
41 (18.1)
|
All
|
All
|
607 (37.3)
|
99 (44.8)
|
All
|
All
|
5 (0.6)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
40 (9.5)
|
18 (13.5)
|
All
|
All
|
3 (2.3)
|
1 (3.4)
|
All
|
All
|
33 (7.2)
|
16 (25.8)
|
All
|
All
|
|
Disease Duration
|
All
|
All
|
All
|
All
|
105 (10.1)
|
41 (18.1)
|
All
|
All
|
608 (37.4)
|
99 (44.8)
|
All
|
All
|
5 (0.6)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
40 (9.5)
|
18 (13.5)
|
All
|
All
|
3 (2.3)
|
1 (3.4)
|
All
|
All
|
33 (7.2)
|
16 (25.8)
|
All
|
All
|
|
Flares
|
All
|
All
|
All
|
All
|
173 (16.6)
|
51 (22.5)
|
All
|
All
|
654 (40.2)
|
99 (44.8)
|
All
|
All
|
5 (0.6)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
45 (10.7)
|
23 (17.3)
|
All
|
All
|
5 (3.9)
|
3 (10.3)
|
All
|
All
|
36 (7.9)
|
16 (25.8)
|
All
|
All
|
|
Tophi
|
All
|
All
|
All
|
All
|
263 (25.3)
|
58 (25.6)
|
All
|
All
|
1,063 (65.3)
|
146 (66.1)
|
All
|
All
|
5 (0.6)
|
None
|
2 (0.9)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
72 (17.1)
|
39 (29.3)
|
All
|
All
|
52 (40.6)
|
8 (27.6)
|
All
|
All
|
55 (12.0)
|
12 (19.4)
|
All
|
All
|
|
PRS
|
464 (5.5)
|
50 (5.0)
|
10,749 (6.0)
|
12,776 (5.9)
|
9 (0.9)
|
2 (0.9)
|
5 (0.7)
|
4 (0.6)
|
127 (7.8)
|
19 (8.6)
|
None
|
None
|
2 (0.2)
|
1 (1.5)
|
1 (0.4)
|
None
|
2 (0.8)
|
None
|
3 (1.7)
|
None
|
1 (0.9)
|
None
|
None
|
2 (1.5)
|
2 (0.8)
|
3 (0.8)
|
2 (1.6)
|
None
|
None
|
None
|
6 (1.3)
|
1 (1.6)
|
4 (2.0)
|
3 (1.6)
|
|
Urate PRS
|
2,879 (34.3)
|
325 (32.7)
|
60,408 (33.9)
|
73,101 (34.0)
|
55 (5.3)
|
9 (4.0)
|
23 (3.0)
|
93 (14.9)
|
187 (11.5)
|
27 (12.2)
|
1 (2.2)
|
None
|
30 (3.6)
|
3 (4.5)
|
5 (2.2)
|
None
|
15 (6.2)
|
None
|
6 (3.4)
|
None
|
4 (3.6)
|
None
|
14 (3.3)
|
6 (4.5)
|
10 (4.1)
|
26 (6.8)
|
10 (7.8)
|
2 (6.9)
|
None
|
3 (8.1)
|
21 (4.6)
|
3 (4.8)
|
11 (5.4)
|
12 (6.5)
|
|
Prophylaxis
|
All
|
All
|
All
|
All
|
971 (93.4)
|
208 (91.6)
|
768 (99.9)
|
All
|
783 (48.1)
|
125 (56.6)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
361 (85.7)
|
118 (88.7)
|
242 (99.6)
|
All
|
2 (1.6)
|
3 (10.3)
|
43 (97.7)
|
All
|
405 (88.6)
|
53 (85.5)
|
All
|
183 (99.5)
|
|
BMI
|
31 (0.4)
|
8 (0.8)
|
621 (0.3)
|
641 (0.3)
|
79 (7.6)
|
22 (9.7)
|
175 (22.8)
|
53 (8.5)
|
502 (30.9)
|
85 (38.5)
|
All
|
All
|
10 (1.2)
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
9 (2.1)
|
5 (3.8)
|
3 (1.2)
|
13 (3.4)
|
3 (2.3)
|
None
|
4 (9.1)
|
6 (16.2)
|
13 (2.8)
|
5 (8.1)
|
3 (1.5)
|
9 (4.9)
|
|
Hypertension
|
None
|
None
|
None
|
None
|
408 (39.2)
|
39 (17.2)
|
489 (63.6)
|
324 (52.0)
|
855 (52.6)
|
120 (54.3)
|
All
|
All
|
None
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
128 (30.4)
|
18 (13.5)
|
172 (70.8)
|
253 (65.7)
|
32 (25.0)
|
3 (10.3)
|
32 (72.7)
|
28 (75.7)
|
205 (44.9)
|
15 (24.2)
|
161 (79.3)
|
139 (75.5)
|
|
Type 2 Diabetes
|
847 (10.1)
|
100 (10.1)
|
19,430 (10.9)
|
27,578 (12.8)
|
106 (10.2)
|
25 (11.0)
|
314 (40.8)
|
268 (43.0)
|
646 (39.7)
|
113 (51.1)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
12 (2.9)
|
1 (0.8)
|
9 (3.7)
|
18 (4.7)
|
90 (70.3)
|
19 (65.5)
|
37 (84.1)
|
36 (97.3)
|
18 (3.9)
|
2 (3.2)
|
6 (3.0)
|
9 (4.9)
|
|
Heart Disease
|
None
|
None
|
None
|
None
|
235 (22.6)
|
35 (15.4)
|
381 (49.5)
|
294 (47.2)
|
1,093 (67.2)
|
142 (64.3)
|
All
|
All
|
None
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
46 (10.9)
|
11 (8.3)
|
28 (11.5)
|
61 (15.8)
|
48 (37.5)
|
7 (24.1)
|
29 (65.9)
|
24 (64.9)
|
39 (8.5)
|
5 (8.1)
|
48 (23.6)
|
14 (7.6)
|
|
Kidney Disease
|
379 (4.5)
|
48 (4.8)
|
8,161 (4.6)
|
10,107 (4.7)
|
245 (23.6)
|
47 (20.7)
|
385 (50.1)
|
263 (42.2)
|
1,083 (66.6)
|
134 (60.6)
|
All
|
All
|
10 (1.2)
|
None
|
2 (0.9)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
90 (21.4)
|
27 (20.3)
|
193 (79.4)
|
311 (80.8)
|
98 (76.6)
|
17 (58.6)
|
38 (86.4)
|
29 (78.4)
|
82 (17.9)
|
11 (17.7)
|
163 (80.3)
|
145 (78.8)
|
|
Dyslipidemia
|
None
|
None
|
None
|
None
|
404 (38.8)
|
73 (32.2)
|
439 (57.1)
|
215 (34.5)
|
719 (44.2)
|
121 (54.8)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
1 (0.6)
|
None
|
None
|
None
|
147 (34.9)
|
44 (33.1)
|
172 (70.8)
|
272 (70.6)
|
56 (43.8)
|
7 (24.1)
|
28 (63.6)
|
29 (78.4)
|
159 (34.8)
|
16 (25.8)
|
148 (72.9)
|
117 (63.6)
|
|
Stroke
|
None
|
None
|
None
|
None
|
320 (30.8)
|
65 (28.6)
|
268 (34.9)
|
90 (14.4)
|
1,089 (66.9)
|
166 (75.1)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
65 (15.4)
|
18 (13.5)
|
32 (13.2)
|
50 (13.0)
|
68 (53.1)
|
11 (37.9)
|
31 (70.5)
|
26 (70.3)
|
53 (11.6)
|
7 (11.3)
|
46 (22.7)
|
15 (8.2)
|
|
Alcoholic Drinks / Week
|
1,173 (14.0)
|
472 (47.4)
|
36,709 (20.6)
|
75,184 (35.0)
|
7 (0.7)
|
1 (0.4)
|
124 (16.1)
|
204 (32.7)
|
1,079 (66.3)
|
150 (67.9)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
34 (8.1)
|
3 (2.3)
|
2 (0.8)
|
7 (1.8)
|
None
|
None
|
None
|
None
|
36 (7.9)
|
2 (3.2)
|
3 (1.5)
|
1 (0.5)
|
|
Sugar-Sweetened Drinks / Week
|
All
|
All
|
All
|
All
|
126 (12.1)
|
31 (13.7)
|
173 (22.5)
|
51 (8.2)
|
1,352 (83.1)
|
190 (86.0)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
76 (18.1)
|
17 (12.8)
|
6 (2.5)
|
10 (2.6)
|
2 (1.6)
|
None
|
None
|
None
|
91 (19.9)
|
5 (8.1)
|
4 (2.0)
|
2 (1.1)
|
|
Current Smoker
|
None
|
None
|
None
|
None
|
486 (46.7)
|
128 (56.4)
|
263 (34.2)
|
143 (23.0)
|
1,030 (63.3)
|
135 (61.1)
|
All
|
All
|
All
|
All
|
2 (0.9)
|
None
|
None
|
None
|
None
|
None
|
3 (2.7)
|
None
|
216 (51.3)
|
72 (54.1)
|
100 (41.2)
|
191 (49.6)
|
59 (46.1)
|
9 (31.0)
|
29 (65.9)
|
25 (67.6)
|
319 (69.8)
|
22 (35.5)
|
95 (46.8)
|
71 (38.6)
|
|
Family History of Gout
|
All
|
All
|
All
|
All
|
99 (9.5)
|
26 (11.5)
|
410 (53.3)
|
300 (48.2)
|
703 (43.2)
|
101 (45.7)
|
36 (80.0)
|
76 (90.5)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
61 (14.5)
|
16 (12.0)
|
42 (17.3)
|
59 (15.3)
|
10 (7.8)
|
2 (6.9)
|
3 (6.8)
|
4 (10.8)
|
58 (12.7)
|
3 (4.8)
|
26 (12.8)
|
24 (13.0)
|
|
No. Relatives w/ Gout
|
All
|
All
|
All
|
All
|
327 (31.4)
|
81 (35.7)
|
535 (69.6)
|
462 (74.2)
|
1,229 (75.5)
|
179 (81.0)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
111 (26.4)
|
33 (24.8)
|
66 (27.2)
|
89 (23.1)
|
38 (29.7)
|
9 (31.0)
|
17 (38.6)
|
10 (27.0)
|
94 (20.6)
|
18 (29.0)
|
45 (22.2)
|
49 (26.6)
|
|
Note:
|
|
‘All’ = all missing, ‘None’ = none missing
|
#datatable(table1, extension = "Responsive")
From this table we can tell the following:
Age at collection is only missing in GlobalGout, LASSO and West Polynesian Controls, and thus anybody missing this variable should be removed
Serum urate has less than 5% missingness in all European cohorts except for GlobalGout (with 22%); Polynesian controls also have approximately 10% missingness
ULT data seems to be missing for around 50% of people in Gout in Aotearoa, GlobalGout, and the CRYSTAL trial but is present for all other cohorts
Age at onset (and disease duration) are missing at around 10% or less for all cohorts except GlobalGout, which has over 40% missingness for this variable
Flare data is missing at up to 20% for most cohorts, but around 43% missing in GlobalGout
Tophi data is not missing in Ardea, around 10 - 25% missing in Gout in Aotearoa, and 67% missing in GlobalGout
The PRS is missing in up to 8% of individuals (mainly GlobalGout and UK Biobank)
Prophylaxis, comorbidity, lifestyle, and family history data are inconsistently phenotyped
Therefore, I will take the following actions:
I will remove anyone missing age at collection
I will remove anyone missing all three severity traits (excluding controls and UK Biobank)
I will remove anyone missing the PRS
# Build the final analysis cohort in three passes:
#   1. drop rows with no age at collection,
#   2. drop rows with no PRS,
#   3. keep a row only if it is from UK Biobank, is not a gout case, or is a
#      gout case with at least one of AGE1ATK / NUMATK / TOPHIGOUT recorded.
# The third condition is kept in its original form on purpose: rows where
# GOUT or Pheno.Study is NA evaluate to NA and are dropped by filter().
all_pheno_prs2 <- all_pheno_prs %>%
  filter(!is.na(AGECOL)) %>%
  filter(!is.na(PRS)) %>%
  filter(
    Pheno.Study == "UK Biobank" |
      !GOUT |
      GOUT & !(is.na(AGE1ATK) & is.na(NUMATK) & is.na(TOPHIGOUT))
  )
# Sex-specific subsets of the filtered cohort; these feed the per-cohort list
# built below.
all_pheno_prs2_male <- filter(all_pheno_prs2, SEX == "Male")
all_pheno_prs2_female <- filter(all_pheno_prs2, SEX == "Female")
# One data frame per cohort / gout status / sex combination (34 in total).
# Order matters: the elements are matched positionally with `cohortstring`
# when the summary tables are built, so every group below is emitted as
# male-then-female, and gout-then-control where the cohort has controls.
# Everything is built inside local() so the helpers do not leak into the
# global environment.
data_list2 <- local({
  # Ancestry labels and study names shared by several cohort definitions.
  anc_european <- c("European", "European; Iberian", "Iberian")
  anc_east_polynesian <- c("East Polynesian")
  anc_west_polynesian <- c("West Polynesian", "Niuean", "Pukapukan")
  studies_aus_nz <- c(
    "AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou",
    "Renal Disease"
  )

  # Apply one cohort filter to the male subset, then to the female subset.
  male_then_female <- function(pick) {
    list(pick(all_pheno_prs2_male), pick(all_pheno_prs2_female))
  }

  # Four-element group: gout male, gout female, control male, control female.
  gout_then_control <- function(pick) {
    c(
      male_then_female(function(d) pick(filter(d, GOUT))),
      male_then_female(function(d) pick(filter(d, !GOUT)))
    )
  }

  # Two-element group for one Ardea study, which is not split by gout status.
  # `.env$` forces lookup of the local variable rather than a data column.
  ardea_trial <- function(trial_name) {
    male_then_female(function(d) {
      filter(
        d,
        Pheno.Study == .env$trial_name,
        Geno.SpecificAncestry %in% .env$anc_european
      )
    })
  }

  c(
    # UK Biobank
    gout_then_control(function(d) {
      filter(d, Pheno.Study == "UK Biobank")
    }),
    # European-ancestry participants from the Aus/NZ study list
    gout_then_control(function(d) {
      filter(
        d,
        Geno.SpecificAncestry %in% .env$anc_european,
        Pheno.Study %in% .env$studies_aus_nz
      )
    }),
    # EuroGout, European ancestry
    gout_then_control(function(d) {
      filter(
        d,
        Pheno.Study == "EuroGout",
        Geno.SpecificAncestry %in% .env$anc_european
      )
    }),
    # Ardea studies, European ancestry
    ardea_trial("Ardea: 401"),
    ardea_trial("Ardea: CLEAR1"),
    ardea_trial("Ardea: CLEAR2"),
    ardea_trial("Ardea: CRYSTAL"),
    ardea_trial("Ardea: LIGHT"),
    # East Polynesian, excluding the Ngati Porou study
    gout_then_control(function(d) {
      filter(
        d,
        Geno.SpecificAncestry %in% .env$anc_east_polynesian,
        Pheno.Study != "Ngati Porou"
      )
    }),
    # East Polynesian, Ngati Porou study only
    gout_then_control(function(d) {
      filter(
        d,
        Geno.SpecificAncestry %in% .env$anc_east_polynesian,
        Pheno.Study == "Ngati Porou"
      )
    }),
    # West Polynesian (including Niuean and Pukapukan)
    gout_then_control(function(d) {
      filter(d, Geno.SpecificAncestry %in% .env$anc_west_polynesian)
    })
  )
})
# Missingness table for the filtered cohorts: one row per cohort here, then
# transposed below so that variables become rows and cohorts become columns.
# NOTE(review): `missing()` must be the project-defined helper that masks
# base::missing(); presumably it returns "All", "None" or "n (%)" for a
# column, as the table footnote suggests. Confirm against its definition.
table1 <- tibble(
  "Cohort" = cohortstring,
  "N" = unlist(lapply(data_list2, nrow)),
  "Age at Collection" = unlist(lapply(data_list2, function(x) missing(x$AGECOL))),
  "Serum Urate" = unlist(lapply(data_list2, function(x) missing(x$URATE))),
  "ULT" = unlist(lapply(data_list2, function(x) missing(x$ULT))),
  "Age at Onset" = unlist(lapply(data_list2, function(x) missing(x$AGE1ATK))),
  "Disease Duration" = unlist(lapply(data_list2, function(x) missing(x$DURATION))),
  "Flares" = unlist(lapply(data_list2, function(x) missing(x$NUMATK))),
  "Tophi" = unlist(lapply(data_list2, function(x) missing(x$TOPHIGOUT))),
  "PRS" = unlist(lapply(data_list2, function(x) missing(x$PRS))),
  "Prophylaxis" = unlist(lapply(data_list2, function(x) missing(x$PROPHY))),
  "BMI" = unlist(lapply(data_list2, function(x) missing(x$BMI))),
  "Hypertension" = unlist(lapply(data_list2, function(x) missing(x$HYPERTENSION))),
  "Type 2 Diabetes" = unlist(lapply(data_list2, function(x) missing(x$DIABETES))),
  "Heart Disease" = unlist(lapply(data_list2, function(x) missing(x$HEART))),
  "Kidney Disease" = unlist(lapply(data_list2, function(x) missing(x$KIDNEY))),
  "Dyslipidemia" = unlist(lapply(data_list2, function(x) missing(x$LIPIDS))),
  "Stroke" = unlist(lapply(data_list2, function(x) missing(x$STROKE))),
  "Alcoholic Drinks / Week" = unlist(lapply(data_list2, function(x) missing(x$TOTALALC))),
  "Sugar-Sweetened Drinks / Week" = unlist(lapply(data_list2, function(x) missing(x$SUGDRINK))),
  "Current Smoker" = unlist(lapply(data_list2, function(x) missing(x$CURSMOKE))),
  "Family History of Gout" = unlist(lapply(data_list2, function(x) missing(x$FAMGOUT))),
  "No. Relatives w/ Gout" = unlist(lapply(data_list2, function(x) missing(x$FAMGOUTNUM)))
)

# Transpose and apply the space replacement to every cohort column.
# everything() is used instead of 1:ncol(table1): inside this pipeline
# `table1` still names the un-transposed tibble, so its column count is the
# number of variables rather than the number of cohort columns, and the
# trailing cohort columns would be skipped.
# NOTE(review): pattern and replacement look identical as written; presumably
# the replacement is meant to be a non-breaking space so cell values do not
# wrap. Confirm against the original .Rmd.
table1 <- transpose_df(table1) %>%
  column_to_rownames(var = "Cohort") %>%
  mutate(across(.cols = everything(), ~ str_replace(string = .x, pattern = " ", replacement = " ")))
row.names(table1) <- str_replace(row.names(table1), " ", " ")

# Render as a scrollable striped table. escape = FALSE (spelled out, since
# `F` can be reassigned) passes the cell text to the output unescaped.
table1 %>%
  kable(
    col.names = clean_names,
    align = "c",
    escape = FALSE
  ) %>%
  kable_styling("striped") %>%
  scroll_box(width = "900px", height = "475px") %>%
  footnote("'All' = all missing, 'None' = none missing")
|
|
UK Biobank Gout Male
|
UK Biobank Gout Female
|
UK Biobank Control Male
|
UK Biobank Control Female
|
Aus/NZ European Gout Male
|
Aus/NZ European Gout Female
|
Aus/NZ European Control Male
|
Aus/NZ European Control Female
|
GlobalGout Gout Male
|
GlobalGout Gout Female
|
GlobalGout Control Male
|
GlobalGout Control Female
|
Ardea LASSO Gout Male
|
Ardea LASSO Gout Female
|
Ardea CLEAR1 Gout Male
|
Ardea CLEAR1 Gout Female
|
Ardea CLEAR2 Gout Male
|
Ardea CLEAR2 Gout Female
|
Ardea CRYSTAL Gout Male
|
Ardea CRYSTAL Gout Female
|
Ardea LIGHT Gout Male
|
Ardea LIGHT Gout Female
|
East Polynesian Gout Male
|
East Polynesian Gout Female
|
East Polynesian Control Male
|
East Polynesian Control Female
|
East Polynesian Gout Male NP
|
East Polynesian Gout Female NP
|
East Polynesian Control Male NP
|
East Polynesian Control Female NP
|
West Polynesian Gout Male
|
West Polynesian Gout Female
|
West Polynesian Control Male
|
West Polynesian Control Female
|
|
N
|
7930
|
945
|
167249
|
202100
|
978
|
210
|
764
|
619
|
1032
|
124
|
44
|
79
|
819
|
65
|
230
|
16
|
239
|
7
|
175
|
4
|
109
|
8
|
408
|
122
|
241
|
382
|
124
|
28
|
44
|
37
|
436
|
54
|
199
|
180
|
|
Age at Collection
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
|
Serum Urate
|
372 (4.7)
|
58 (6.1)
|
7,828 (4.7)
|
9,738 (4.8)
|
18 (1.8)
|
4 (1.9)
|
36 (4.7)
|
24 (3.9)
|
50 (4.8)
|
8 (6.5)
|
32 (72.7)
|
71 (89.9)
|
4 (0.5)
|
1 (1.5)
|
1 (0.4)
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
2 (0.5)
|
1 (0.8)
|
35 (14.5)
|
64 (16.8)
|
None
|
1 (3.6)
|
None
|
None
|
1 (0.2)
|
1 (1.9)
|
21 (10.6)
|
16 (8.9)
|
|
ULT
|
None
|
None
|
All
|
All
|
412 (42.1)
|
100 (47.6)
|
All
|
All
|
279 (27.0)
|
42 (33.9)
|
All
|
All
|
3 (0.4)
|
1 (1.5)
|
None
|
None
|
None
|
None
|
81 (46.3)
|
3 (75.0)
|
None
|
None
|
125 (30.6)
|
33 (27.0)
|
All
|
All
|
21 (16.9)
|
2 (7.1)
|
All
|
All
|
126 (28.9)
|
14 (25.9)
|
All
|
All
|
|
Age at Onset
|
All
|
All
|
All
|
All
|
51 (5.2)
|
24 (11.4)
|
All
|
All
|
25 (2.4)
|
6 (4.8)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
27 (6.6)
|
9 (7.4)
|
All
|
All
|
1 (0.8)
|
None
|
All
|
All
|
18 (4.1)
|
9 (16.7)
|
All
|
All
|
|
Disease Duration
|
All
|
All
|
All
|
All
|
51 (5.2)
|
24 (11.4)
|
All
|
All
|
25 (2.4)
|
6 (4.8)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
27 (6.6)
|
9 (7.4)
|
All
|
All
|
1 (0.8)
|
None
|
All
|
All
|
18 (4.1)
|
9 (16.7)
|
All
|
All
|
|
Flares
|
All
|
All
|
All
|
All
|
119 (12.2)
|
34 (16.2)
|
All
|
All
|
72 (7.0)
|
6 (4.8)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
32 (7.8)
|
14 (11.5)
|
All
|
All
|
3 (2.4)
|
2 (7.1)
|
All
|
All
|
20 (4.6)
|
9 (16.7)
|
All
|
All
|
|
Tophi
|
All
|
All
|
All
|
All
|
210 (21.5)
|
42 (20.0)
|
All
|
All
|
476 (46.1)
|
50 (40.3)
|
All
|
All
|
None
|
None
|
2 (0.9)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
59 (14.5)
|
30 (24.6)
|
All
|
All
|
50 (40.3)
|
7 (25.0)
|
All
|
All
|
39 (8.9)
|
5 (9.3)
|
All
|
All
|
|
PRS
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
|
Prophylaxis
|
All
|
All
|
All
|
All
|
918 (93.9)
|
196 (93.3)
|
763 (99.9)
|
All
|
317 (30.7)
|
43 (34.7)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
348 (85.3)
|
108 (88.5)
|
240 (99.6)
|
All
|
None
|
2 (7.1)
|
43 (97.7)
|
All
|
390 (89.4)
|
45 (83.3)
|
All
|
179 (99.4)
|
|
BMI
|
30 (0.4)
|
8 (0.8)
|
580 (0.3)
|
606 (0.3)
|
58 (5.9)
|
18 (8.6)
|
174 (22.8)
|
53 (8.6)
|
44 (4.3)
|
2 (1.6)
|
All
|
All
|
5 (0.6)
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
7 (1.7)
|
4 (3.3)
|
3 (1.2)
|
13 (3.4)
|
2 (1.6)
|
None
|
4 (9.1)
|
6 (16.2)
|
10 (2.3)
|
3 (5.6)
|
3 (1.5)
|
8 (4.4)
|
|
Hypertension
|
None
|
None
|
None
|
None
|
383 (39.2)
|
36 (17.1)
|
486 (63.6)
|
322 (52.0)
|
365 (35.4)
|
35 (28.2)
|
All
|
All
|
None
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
120 (29.4)
|
14 (11.5)
|
171 (71.0)
|
252 (66.0)
|
30 (24.2)
|
3 (10.7)
|
32 (72.7)
|
28 (75.7)
|
196 (45.0)
|
13 (24.1)
|
158 (79.4)
|
135 (75.0)
|
|
Type 2 Diabetes
|
796 (10.0)
|
95 (10.1)
|
18,200 (10.9)
|
25,918 (12.8)
|
72 (7.4)
|
17 (8.1)
|
312 (40.8)
|
266 (43.0)
|
181 (17.5)
|
29 (23.4)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
11 (2.7)
|
1 (0.8)
|
9 (3.7)
|
18 (4.7)
|
86 (69.4)
|
18 (64.3)
|
37 (84.1)
|
36 (97.3)
|
15 (3.4)
|
None
|
6 (3.0)
|
8 (4.4)
|
|
Heart Disease
|
None
|
None
|
None
|
None
|
209 (21.4)
|
32 (15.2)
|
379 (49.6)
|
292 (47.2)
|
518 (50.2)
|
52 (41.9)
|
All
|
All
|
None
|
None
|
1 (0.4)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
45 (11.0)
|
11 (9.0)
|
28 (11.6)
|
60 (15.7)
|
46 (37.1)
|
7 (25.0)
|
29 (65.9)
|
24 (64.9)
|
34 (7.8)
|
4 (7.4)
|
46 (23.1)
|
13 (7.2)
|
|
Kidney Disease
|
361 (4.6)
|
47 (5.0)
|
7,628 (4.6)
|
9,486 (4.7)
|
222 (22.7)
|
43 (20.5)
|
383 (50.1)
|
261 (42.2)
|
512 (49.6)
|
44 (35.5)
|
All
|
All
|
5 (0.6)
|
None
|
2 (0.9)
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
79 (19.4)
|
25 (20.5)
|
191 (79.3)
|
308 (80.6)
|
94 (75.8)
|
16 (57.1)
|
38 (86.4)
|
29 (78.4)
|
73 (16.7)
|
9 (16.7)
|
160 (80.4)
|
141 (78.3)
|
|
Dyslipidemia
|
None
|
None
|
None
|
None
|
376 (38.4)
|
71 (33.8)
|
438 (57.3)
|
214 (34.6)
|
255 (24.7)
|
35 (28.2)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
1 (0.6)
|
None
|
None
|
None
|
139 (34.1)
|
39 (32.0)
|
171 (71.0)
|
269 (70.4)
|
53 (42.7)
|
7 (25.0)
|
28 (63.6)
|
29 (78.4)
|
150 (34.4)
|
14 (25.9)
|
147 (73.9)
|
115 (63.9)
|
|
Stroke
|
None
|
None
|
None
|
None
|
298 (30.5)
|
62 (29.5)
|
267 (34.9)
|
90 (14.5)
|
603 (58.4)
|
73 (58.9)
|
All
|
All
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
None
|
63 (15.4)
|
16 (13.1)
|
32 (13.3)
|
49 (12.8)
|
66 (53.2)
|
11 (39.3)
|
31 (70.5)
|
26 (70.3)
|
48 (11.0)
|
5 (9.3)
|
44 (22.1)
|
14 (7.8)
|
|
Alcoholic Drinks / Week
|
1,109 (14.0)
|
448 (47.4)
|
34,518 (20.6)
|
70,722 (35.0)
|
None
|
None
|
123 (16.1)
|
202 (32.6)
|
509 (49.3)
|
57 (46.0)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
34 (8.3)
|
3 (2.5)
|
2 (0.8)
|
7 (1.8)
|
None
|
None
|
None
|
None
|
33 (7.6)
|
2 (3.7)
|
3 (1.5)
|
1 (0.6)
|
|
Sugar-Sweetened Drinks / Week
|
All
|
All
|
All
|
All
|
106 (10.8)
|
27 (12.9)
|
172 (22.5)
|
51 (8.2)
|
762 (73.8)
|
94 (75.8)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
75 (18.4)
|
17 (13.9)
|
6 (2.5)
|
10 (2.6)
|
None
|
None
|
None
|
None
|
87 (20.0)
|
4 (7.4)
|
4 (2.0)
|
1 (0.6)
|
|
Current Smoker
|
None
|
None
|
None
|
None
|
463 (47.3)
|
124 (59.0)
|
262 (34.3)
|
142 (22.9)
|
464 (45.0)
|
47 (37.9)
|
All
|
All
|
All
|
All
|
2 (0.9)
|
None
|
None
|
None
|
None
|
None
|
3 (2.8)
|
None
|
211 (51.7)
|
69 (56.6)
|
100 (41.5)
|
190 (49.7)
|
57 (46.0)
|
9 (32.1)
|
29 (65.9)
|
25 (67.6)
|
312 (71.6)
|
21 (38.9)
|
92 (46.2)
|
69 (38.3)
|
|
Family History of Gout
|
All
|
All
|
All
|
All
|
71 (7.3)
|
21 (10.0)
|
408 (53.4)
|
298 (48.1)
|
244 (23.6)
|
19 (15.3)
|
35 (79.5)
|
71 (89.9)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
56 (13.7)
|
15 (12.3)
|
42 (17.4)
|
58 (15.2)
|
8 (6.5)
|
2 (7.1)
|
3 (6.8)
|
4 (10.8)
|
54 (12.4)
|
2 (3.7)
|
26 (13.1)
|
23 (12.8)
|
|
No. Relatives w/ Gout
|
All
|
All
|
All
|
All
|
282 (28.8)
|
64 (30.5)
|
531 (69.5)
|
459 (74.2)
|
643 (62.3)
|
84 (67.7)
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
All
|
104 (25.5)
|
30 (24.6)
|
65 (27.0)
|
88 (23.0)
|
36 (29.0)
|
8 (28.6)
|
17 (38.6)
|
10 (27.0)
|
84 (19.3)
|
13 (24.1)
|
45 (22.6)
|
46 (25.6)
|
|
Note:
|
|
‘All’ = all missing, ‘None’ = none missing
|
So now the following is true about the final cohort:
Age at collection and the PRS are both fully phenotyped
Serum urate has less than 10% missingness in all cohorts except the GlobalGout controls (80% missing)
ULT has around 30-40% missingness in most gout cohorts except 4 of the 5 Ardea cohorts and the UK Biobank gout cohort
The three severity traits are completely available in all Ardea cohorts and completely missing in the UK Biobank. In the remaining cohorts, age at onset/disease duration has around 5 - 10% missingness, flares have around 5 - 15% missingness, and tophi have 10 - 20% missingness for Aus/NZ and around 45% missingness in GlobalGout
Prophylaxis, comorbidity, lifestyle and family history data are poorly phenotyped overall
Given the relatively low proportions of missingness for our most important variables, we are probably okay to use complete cases only for each model. This will of course reduce power and may introduce bias, which I will need to acknowledge as a limitation.
Characteristics of each Cohort
# Descriptive characteristics of each filtered cohort: one row per cohort
# here, then transposed below so that variables become rows and cohorts
# become columns.
# NOTE(review): `report()`, `report_median()` and `sumreport()` are defined
# elsewhere; per the footnote below they presumably format mean ± sd,
# median (IQR) and N (%) respectively. Confirm against their definitions.
table1 <- tibble(
  "Cohort" = cohortstring,
  "N" = unlist(lapply(data_list2, nrow)),
  "Age at Collection (years)" = unlist(lapply(data_list2, function(x) report(x$AGECOL))),
  "Serum Urate (mg/dL)" = unlist(lapply(data_list2, function(x) report(x$URATE))),
  "ULT" = unlist(lapply(data_list2, function(x) sumreport(x$ULT))),
  "Age at Onset (years)" = unlist(lapply(data_list2, function(x) report(x$AGE1ATK))),
  "Disease Duration (years)" = unlist(lapply(data_list2, function(x) report(x$DURATION))),
  "Number of Flares in Last Year" = unlist(lapply(data_list2, function(x) report_median(x$NUMATK))),
  "Presence of Tophi" = unlist(lapply(data_list2, function(x) sumreport(x$TOPHIGOUT))),
  "PRS" = unlist(lapply(data_list2, function(x) report(x$PRS))),
  "Prophylaxis" = unlist(lapply(data_list2, function(x) sumreport(x$PROPHY))),
  "BMI" = unlist(lapply(data_list2, function(x) report(x$BMI))),
  "Hypertension" = unlist(lapply(data_list2, function(x) sumreport(x$HYPERTENSION))),
  "Type 2 Diabetes" = unlist(lapply(data_list2, function(x) sumreport(x$DIABETES))),
  "Heart Disease" = unlist(lapply(data_list2, function(x) sumreport(x$HEART))),
  "Kidney Disease" = unlist(lapply(data_list2, function(x) sumreport(x$KIDNEY))),
  "Dyslipidemia" = unlist(lapply(data_list2, function(x) sumreport(x$LIPIDS))),
  "Stroke" = unlist(lapply(data_list2, function(x) sumreport(x$STROKE))),
  "Alcoholic Drinks / Week" = unlist(lapply(data_list2, function(x) report(x$TOTALALC))),
  "Sugar-Sweetened Drinks / Week" = unlist(lapply(data_list2, function(x) report(x$SUGDRINK))),
  "Current Smoker" = unlist(lapply(data_list2, function(x) sumreport(x$CURSMOKE))),
  "Family History of Gout" = unlist(lapply(data_list2, function(x) sumreport(x$FAMGOUT))),
  "No. Relatives w/ Gout" = unlist(lapply(data_list2, function(x) report(x$FAMGOUTNUM)))
)

# Transpose and apply the space replacement to every cohort column.
# everything() is used instead of 1:ncol(table1): inside this pipeline
# `table1` still names the un-transposed tibble, so its column count is the
# number of variables rather than the number of cohort columns, and the
# trailing cohort columns would be skipped.
# NOTE(review): pattern and replacement look identical as written; presumably
# the replacement is meant to be a non-breaking space so cell values do not
# wrap. Confirm against the original .Rmd.
table1 <- transpose_df(table1) %>%
  column_to_rownames(var = "Cohort") %>%
  mutate(across(.cols = everything(), ~ str_replace(string = .x, pattern = " ", replacement = " ")))
row.names(table1) <- str_replace(row.names(table1), " ", " ")

# Render as a scrollable striped table. escape = FALSE (spelled out, since
# `F` can be reassigned) passes the cell text to the output unescaped.
table1 %>%
  kable(
    col.names = clean_names,
    align = "c",
    escape = FALSE
  ) %>%
  kable_styling("striped") %>%
  scroll_box(width = "900px", height = "475px") %>%
  footnote("Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).")
|
|
UK Biobank Gout Male
|
UK Biobank Gout Female
|
UK Biobank Control Male
|
UK Biobank Control Female
|
Aus/NZ European Gout Male
|
Aus/NZ European Gout Female
|
Aus/NZ European Control Male
|
Aus/NZ European Control Female
|
GlobalGout Gout Male
|
GlobalGout Gout Female
|
GlobalGout Control Male
|
GlobalGout Control Female
|
Ardea LASSO Gout Male
|
Ardea LASSO Gout Female
|
Ardea CLEAR1 Gout Male
|
Ardea CLEAR1 Gout Female
|
Ardea CLEAR2 Gout Male
|
Ardea CLEAR2 Gout Female
|
Ardea CRYSTAL Gout Male
|
Ardea CRYSTAL Gout Female
|
Ardea LIGHT Gout Male
|
Ardea LIGHT Gout Female
|
East Polynesian Gout Male
|
East Polynesian Gout Female
|
East Polynesian Control Male
|
East Polynesian Control Female
|
East Polynesian Gout Male NP
|
East Polynesian Gout Female NP
|
East Polynesian Control Male NP
|
East Polynesian Control Female NP
|
West Polynesian Gout Male
|
West Polynesian Gout Female
|
West Polynesian Control Male
|
West Polynesian Control Female
|
|
N
|
7930
|
945
|
167249
|
202100
|
978
|
210
|
764
|
619
|
1032
|
124
|
44
|
79
|
819
|
65
|
230
|
16
|
239
|
7
|
175
|
4
|
109
|
8
|
408
|
122
|
241
|
382
|
124
|
28
|
44
|
37
|
436
|
54
|
199
|
180
|
|
Age at Collection (years)
|
59.9 ± 7.0
|
61.9 ± 6.1
|
57.0 ± 8.1
|
56.7 ± 7.9
|
62.4 ± 12.4
|
70.0 ± 12.7
|
54.9 ± 17.1
|
51.3 ± 17.3
|
60.1 ± 13.1
|
67.6 ± 11.1
|
60.0 ± 14.8
|
64.2 ± 11.5
|
51.4 ± 11.8
|
60.7 ± 10.6
|
52.3 ± 11.1
|
61.4 ± 7.4
|
53.0 ± 10.8
|
55.0 ± 16.0
|
53.9 ± 11.0
|
63.8 ± 5.4
|
53.3 ± 11.8
|
64.0 ± 16.1
|
54.3 ± 12.4
|
60.7 ± 11.7
|
43.9 ± 15.6
|
45.6 ± 14.8
|
59.7 ± 11.3
|
59.1 ± 13.3
|
49.8 ± 13.5
|
48.7 ± 17.1
|
47.5 ± 12.3
|
53.4 ± 13.4
|
39.3 ± 15.0
|
40.5 ± 15.4
|
|
Serum Urate (mg/dL)
|
6.7 ± 1.7
|
6.0 ± 2.0
|
5.9 ± 1.2
|
4.5 ± 1.1
|
6.7 ± 1.9
|
6.4 ± 2.3
|
5.5 ± 2.8
|
3.3 ± 2.6
|
7.4 ± 2.3
|
7.7 ± 2.6
|
6.6 ± 1.7
|
6.6 ± 1.7
|
8.9 ± 1.3
|
8.9 ± 1.4
|
7.9 ± 1.4
|
8.1 ± 1.2
|
7.9 ± 1.5
|
8.4 ± 2.0
|
8.8 ± 1.5
|
10.1 ± 1.2
|
9.3 ± 1.7
|
8.1 ± 1.4
|
7.0 ± 2.3
|
6.3 ± 2.5
|
6.5 ± 1.9
|
5.4 ± 1.7
|
7.0 ± 1.7
|
6.9 ± 2.4
|
6.3 ± 1.3
|
5.3 ± 1.3
|
7.7 ± 2.1
|
7.0 ± 2.7
|
6.6 ± 1.8
|
5.3 ± 1.7
|
|
ULT
|
4505 (56.8)
|
372 (39.4)
|
NA
|
NA
|
564 (99.6)
|
108 (98.2)
|
NA
|
NA
|
570 (75.7)
|
49 (59.8)
|
NA
|
NA
|
255 (31.2)
|
26 (40.6)
|
230 (100.0)
|
16 (100.0)
|
239 (100.0)
|
7 (100.0)
|
94 (100.0)
|
1 (100.0)
|
109 (100.0)
|
8 (100.0)
|
262 (92.6)
|
86 (96.6)
|
NA
|
NA
|
96 (93.2)
|
19 (73.1)
|
NA
|
NA
|
292 (94.2)
|
38 (95.0)
|
NA
|
NA
|
|
Age at Onset (years)
|
NA
|
NA
|
NA
|
NA
|
46.4 ± 15.8
|
59.5 ± 15.7
|
NA
|
NA
|
46.5 ± 14.0
|
57.8 ± 12.5
|
NA
|
NA
|
41.4 ± 13.4
|
55.1 ± 12.0
|
41.9 ± 12.4
|
55.2 ± 11.2
|
42.6 ± 13.2
|
46.1 ± 20.6
|
40.1 ± 13.0
|
61.5 ± 6.2
|
42.4 ± 13.1
|
55.2 ± 17.3
|
37.9 ± 14.0
|
49.4 ± 15.4
|
NA
|
NA
|
39.1 ± 15.2
|
46.0 ± 16.8
|
NA
|
NA
|
34.6 ± 12.0
|
44.3 ± 15.0
|
NA
|
NA
|
|
Disease Duration (years)
|
NA
|
NA
|
NA
|
NA
|
16.8 ± 12.7
|
10.9 ± 10.4
|
NA
|
NA
|
14.5 ± 11.4
|
10.6 ± 9.8
|
NA
|
NA
|
11.0 ± 9.4
|
6.6 ± 8.0
|
11.4 ± 9.4
|
7.1 ± 9.5
|
11.4 ± 9.8
|
9.9 ± 11.6
|
14.8 ± 10.0
|
3.2 ± 1.0
|
11.9 ± 8.7
|
9.8 ± 11.0
|
17.2 ± 12.8
|
13.1 ± 13.2
|
NA
|
NA
|
21.7 ± 15.3
|
14.1 ± 12.6
|
NA
|
NA
|
13.6 ± 10.3
|
9.2 ± 9.2
|
NA
|
NA
|
|
Number of Flares in Last Year
|
NA
|
NA
|
NA
|
NA
|
2 (0 - 4)
|
1.5 (0 - 3.25)
|
NA
|
NA
|
2 (1 - 4)
|
2.5 (1 - 4)
|
NA
|
NA
|
4 (3 - 8)
|
3 (3 - 6)
|
3 (2 - 6)
|
3 (3 - 4)
|
4 (2 - 8)
|
5 (3 - 6)
|
4 (3 - 6)
|
4.5 (2.25 - 6)
|
4 (2 - 10)
|
4 (2.75 - 5.25)
|
3 (1 - 6)
|
2 (0 - 5)
|
NA
|
NA
|
2 (0 - 3)
|
3 (1 - 6)
|
NA
|
NA
|
4 (2 - 10)
|
2 (1 - 5)
|
NA
|
NA
|
|
Presence of Tophi
|
NA
|
NA
|
NA
|
NA
|
333 (43.4)
|
67 (39.9)
|
NA
|
NA
|
320 (57.6)
|
46 (62.2)
|
NA
|
NA
|
138 (16.8)
|
5 (7.7)
|
34 (14.9)
|
1 (6.2)
|
54 (22.6)
|
2 (28.6)
|
174 (99.4)
|
4 (100.0)
|
26 (23.9)
|
5 (62.5)
|
144 (41.3)
|
26 (28.3)
|
NA
|
NA
|
9 (12.2)
|
4 (19.0)
|
NA
|
NA
|
177 (44.6)
|
14 (28.6)
|
NA
|
NA
|
|
PRS
|
4.0 ± 0.6
|
4.0 ± 0.7
|
3.7 ± 0.6
|
3.7 ± 0.6
|
4.1 ± 0.7
|
4.0 ± 0.6
|
3.7 ± 0.6
|
3.7 ± 0.6
|
4.0 ± 0.6
|
4.0 ± 0.6
|
3.8 ± 0.6
|
3.8 ± 0.6
|
4.1 ± 0.7
|
4.1 ± 0.6
|
4.2 ± 0.7
|
4.3 ± 0.6
|
4.2 ± 0.6
|
4.3 ± 0.8
|
4.2 ± 0.6
|
4.0 ± 0.4
|
4.1 ± 0.6
|
4.2 ± 0.2
|
4.4 ± 0.5
|
4.4 ± 0.5
|
4.2 ± 0.4
|
4.2 ± 0.5
|
4.2 ± 0.5
|
4.4 ± 0.5
|
4.2 ± 0.5
|
4.1 ± 0.6
|
4.8 ± 0.6
|
4.7 ± 0.6
|
4.3 ± 0.6
|
4.3 ± 0.6
|
|
Prophylaxis
|
NA
|
NA
|
NA
|
NA
|
56 (93.3)
|
13 (92.9)
|
0 (0.0)
|
NA
|
443 (62.0)
|
55 (67.9)
|
NA
|
NA
|
810 (98.9)
|
65 (100.0)
|
230 (100.0)
|
16 (100.0)
|
239 (100.0)
|
7 (100.0)
|
175 (100.0)
|
4 (100.0)
|
109 (100.0)
|
8 (100.0)
|
59 (98.3)
|
13 (92.9)
|
1 (100.0)
|
NA
|
112 (90.3)
|
23 (88.5)
|
1 (100.0)
|
NA
|
46 (100.0)
|
7 (77.8)
|
NA
|
0 (0.0)
|
|
BMI
|
30.5 ± 4.8
|
32.3 ± 6.6
|
27.7 ± 4.2
|
27.0 ± 5.1
|
30.3 ± 5.2
|
30.9 ± 7.3
|
27.2 ± 4.7
|
27.0 ± 6.2
|
29.4 ± 4.7
|
30.9 ± 6.7
|
NA
|
NA
|
34.1 ± 6.7
|
38.0 ± 10.3
|
34.6 ± 6.1
|
38.1 ± 6.5
|
33.7 ± 6.0
|
36.2 ± 7.5
|
32.2 ± 5.4
|
36.5 ± 3.8
|
31.3 ± 4.9
|
35.7 ± 8.7
|
35.4 ± 8.0
|
38.2 ± 9.8
|
31.9 ± 7.1
|
32.7 ± 8.5
|
35.9 ± 7.7
|
39.5 ± 7.6
|
32.5 ± 7.8
|
29.1 ± 6.1
|
36.1 ± 6.7
|
38.5 ± 9.1
|
33.1 ± 6.2
|
34.3 ± 7.7
|
|
Hypertension
|
5552 (70.0)
|
751 (79.5)
|
66308 (39.6)
|
61099 (30.2)
|
573 (96.3)
|
172 (98.9)
|
169 (60.8)
|
121 (40.7)
|
662 (99.3)
|
89 (100.0)
|
NA
|
NA
|
401 (49.0)
|
48 (73.8)
|
146 (63.8)
|
15 (93.8)
|
166 (69.5)
|
6 (85.7)
|
101 (57.7)
|
4 (100.0)
|
57 (52.3)
|
8 (100.0)
|
267 (92.7)
|
107 (99.1)
|
69 (98.6)
|
119 (91.5)
|
94 (100.0)
|
25 (100.0)
|
12 (100.0)
|
9 (100.0)
|
212 (88.3)
|
39 (95.1)
|
36 (87.8)
|
43 (95.6)
|
|
Type 2 Diabetes
|
1431 (20.1)
|
227 (26.7)
|
13115 (8.8)
|
8479 (4.8)
|
144 (15.9)
|
51 (26.4)
|
55 (12.2)
|
48 (13.6)
|
350 (41.1)
|
53 (55.8)
|
NA
|
NA
|
79 (9.6)
|
16 (24.6)
|
30 (13.0)
|
6 (37.5)
|
32 (13.4)
|
0 (0.0)
|
24 (13.7)
|
2 (50.0)
|
12 (11.0)
|
0 (0.0)
|
121 (30.5)
|
61 (50.4)
|
53 (22.8)
|
77 (21.2)
|
38 (100.0)
|
10 (100.0)
|
7 (100.0)
|
1 (100.0)
|
80 (19.0)
|
26 (48.1)
|
33 (17.1)
|
44 (25.6)
|
|
Heart Disease
|
2144 (27.0)
|
289 (30.6)
|
23067 (13.8)
|
12355 (6.1)
|
320 (41.6)
|
92 (51.7)
|
84 (21.8)
|
37 (11.3)
|
159 (30.9)
|
39 (54.2)
|
NA
|
NA
|
40 (4.9)
|
3 (4.6)
|
16 (7.0)
|
0 (0.0)
|
25 (10.5)
|
0 (0.0)
|
17 (9.7)
|
0 (0.0)
|
5 (4.6)
|
0 (0.0)
|
139 (38.3)
|
65 (58.6)
|
45 (21.1)
|
47 (14.6)
|
40 (51.3)
|
13 (61.9)
|
4 (26.7)
|
2 (15.4)
|
77 (19.2)
|
17 (34.0)
|
12 (7.8)
|
18 (10.8)
|
|
Kidney Disease
|
1079 (14.3)
|
276 (30.7)
|
5574 (3.5)
|
8907 (4.6)
|
359 (47.5)
|
122 (73.1)
|
214 (56.2)
|
251 (70.1)
|
238 (45.8)
|
64 (80.0)
|
NA
|
NA
|
138 (17.0)
|
28 (43.1)
|
28 (12.3)
|
9 (56.2)
|
38 (15.9)
|
4 (57.1)
|
31 (17.7)
|
2 (50.0)
|
13 (11.9)
|
4 (50.0)
|
153 (46.5)
|
71 (73.2)
|
34 (68.0)
|
57 (77.0)
|
28 (93.3)
|
12 (100.0)
|
4 (66.7)
|
8 (100.0)
|
130 (35.8)
|
32 (71.1)
|
21 (53.8)
|
30 (76.9)
|
|
Dyslipidemia
|
4064 (51.2)
|
503 (53.2)
|
47606 (28.5)
|
34238 (16.9)
|
494 (82.1)
|
117 (84.2)
|
169 (51.8)
|
143 (35.3)
|
563 (72.5)
|
71 (79.8)
|
NA
|
NA
|
331 (40.4)
|
37 (56.9)
|
110 (47.8)
|
13 (81.2)
|
98 (41.0)
|
4 (57.1)
|
78 (44.8)
|
3 (75.0)
|
41 (37.6)
|
5 (62.5)
|
235 (87.4)
|
77 (92.8)
|
56 (80.0)
|
90 (79.6)
|
71 (100.0)
|
21 (100.0)
|
16 (100.0)
|
8 (100.0)
|
238 (83.2)
|
36 (90.0)
|
38 (73.1)
|
37 (56.9)
|
|
Stroke
|
641 (8.1)
|
101 (10.7)
|
6727 (4.0)
|
4942 (2.4)
|
48 (7.1)
|
16 (10.8)
|
138 (27.8)
|
214 (40.5)
|
41 (9.6)
|
8 (15.7)
|
NA
|
NA
|
7 (0.9)
|
1 (1.5)
|
3 (1.3)
|
0 (0.0)
|
1 (0.4)
|
0 (0.0)
|
3 (1.7)
|
0 (0.0)
|
0 (0.0)
|
0 (0.0)
|
24 (7.0)
|
13 (12.3)
|
12 (5.7)
|
19 (5.7)
|
2 (3.4)
|
1 (5.9)
|
0 (0.0)
|
0 (0.0)
|
12 (3.1)
|
5 (10.2)
|
4 (2.6)
|
3 (1.8)
|
|
Alcoholic Drinks / Week
|
7.6 ± 5.4
|
6.0 ± 5.1
|
6.6 ± 5.2
|
5.8 ± 5.0
|
7.8 ± 10.5
|
2.4 ± 5.1
|
4.8 ± 9.6
|
2.7 ± 4.2
|
14.1 ± 19.2
|
4.4 ± 7.5
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
5.5 ± 14.3
|
1.9 ± 7.2
|
5.3 ± 10.9
|
2.4 ± 5.8
|
5.9 ± 8.6
|
2.2 ± 4.4
|
3.2 ± 6.0
|
3.8 ± 7.6
|
4.2 ± 9.0
|
0.9 ± 2.7
|
4.3 ± 11.0
|
1.2 ± 3.7
|
|
Sugar-Sweetened Drinks / Week
|
NA
|
NA
|
NA
|
NA
|
1.0 ± 1.5
|
0.6 ± 1.1
|
0.9 ± 1.3
|
0.5 ± 1.1
|
0.8 ± 1.3
|
0.7 ± 1.2
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
1.7 ± 1.9
|
1.0 ± 1.5
|
1.8 ± 2.5
|
1.2 ± 1.6
|
1.3 ± 1.7
|
0.8 ± 1.3
|
2.1 ± 2.2
|
1.4 ± 2.0
|
2.3 ± 2.2
|
1.4 ± 1.5
|
2.0 ± 1.7
|
1.4 ± 1.5
|
|
Current Smoker
|
482 (6.1)
|
74 (7.8)
|
14708 (8.8)
|
13750 (6.8)
|
22 (4.3)
|
5 (5.8)
|
26 (5.2)
|
20 (4.2)
|
94 (16.5)
|
14 (18.2)
|
NA
|
NA
|
NA
|
NA
|
40 (17.5)
|
0 (0.0)
|
30 (12.6)
|
0 (0.0)
|
35 (20.0)
|
0 (0.0)
|
19 (17.9)
|
0 (0.0)
|
35 (17.8)
|
4 (7.5)
|
41 (29.1)
|
45 (23.4)
|
16 (23.9)
|
3 (15.8)
|
4 (26.7)
|
4 (33.3)
|
12 (9.7)
|
2 (6.1)
|
23 (21.5)
|
15 (13.5)
|
|
Family History of Gout
|
NA
|
NA
|
NA
|
NA
|
402 (44.3)
|
89 (47.1)
|
58 (16.3)
|
76 (23.7)
|
270 (34.3)
|
40 (38.1)
|
3 (33.3)
|
2 (25.0)
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
239 (67.9)
|
80 (74.8)
|
87 (43.7)
|
148 (45.7)
|
79 (68.1)
|
22 (84.6)
|
15 (36.6)
|
19 (57.6)
|
237 (62.0)
|
32 (61.5)
|
66 (38.2)
|
61 (38.9)
|
|
No. Relatives w/ Gout
|
NA
|
NA
|
NA
|
NA
|
0.8 ± 1.0
|
1.0 ± 1.3
|
0.3 ± 0.6
|
0.5 ± 0.7
|
0.7 ± 0.8
|
0.9 ± 0.8
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
1.7 ± 2.2
|
1.8 ± 1.5
|
0.8 ± 1.3
|
0.7 ± 1.0
|
2.0 ± 1.9
|
2.2 ± 1.5
|
0.8 ± 1.0
|
1.3 ± 1.4
|
1.5 ± 1.9
|
1.5 ± 1.9
|
0.6 ± 0.9
|
0.6 ± 0.9
|
|
Note:
|
|
Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).
|
From this table we learn the following:
In general, gout cohorts consist of approximately 80 - 90% males, while control cohorts are closer to 50% male.
Male European gout cohorts are mostly 60 years old on average, except for Ardea cohorts who are younger (around 50-55). Male East Polynesian gout cohorts are around 50 - 60 years old, while the male West Polynesian gout cases are younger (average of 47 years old). Female gout cohorts are consistently 5-10 years older than their male counterparts. Controls are generally 5 to 10 years younger than cases.
Serum urate levels for gout cohorts tend to be between 6.5 and 9 mg/dL, while control cohorts sit around 4 to 6 mg/dL on average. Females in general have lower serum urate than males.
Most gout cohorts have high rates of ULT (many are over 90% on ULT, though this is likely related to missing data issues). Similar ULT usage is seen in males and females, though perhaps slightly fewer females are prescribed ULT than males.
Mean onset is around 40 - 50 for European cohorts and around 35 - 40 for Polynesians. Average disease duration is around 10 - 20 years for all cohorts. All females have a much higher age at onset compared to males and a shorter disease duration.
Gout cohorts have up to 10 flares per year on average. No obvious difference in flares between males and females, perhaps slightly lower in Polynesian females vs Polynesian males.
Gout cohorts tend to have between 15 and 50% tophaceous gout. Inconsistent decrease in number of tophi in females vs males.
The PRS sits around 4.0 to 4.4 for European gout cohorts, around 3.7 for European control cohorts, around 4.4 for East Polynesian gout cohorts, 4.8 for West Polynesian gout cohorts and around 4.2 to 4.3 for Polynesian control cohorts. No obvious difference between males and females.
For cohorts with prophylaxis data, almost all gout cases are on prophylaxis. No obvious sex differences.
European gout cases have a mean BMI of between 30 and 35 on average, while European controls are closer to 27 on average. Polynesian cases have a mean BMI of 36 while controls have around 33 on average. Female gout cases consistently have higher BMI and higher rates of all comorbidities than their male counterparts, while female controls have lower BMI and lower rates of all comorbidities than male controls.
Gout cohorts are around 50 - 100% hypertensive, while European control cohorts are around 35 - 50% and Polynesian control cohorts are 90 - 95% hypertensive
Type 2 diabetes seems to be more common in gout cohorts, at between 10 and 40%, while it sits at around 7 - 13% for European controls, and around 23% for Polynesian controls
Heart disease is around 20 - 45% frequency in gout cohorts, though Ardea cohorts all show lower amounts at around 5 - 10% each. The control cohorts sit around 10 - 20% each
Kidney disease is relatively common among the Aus/NZ cohorts and the GlobalGout cohort, but not as common among the Ardea cohorts or UK Biobank. This may be because kidney disease measurements were mainly available for people involved in renal disease studies.
Dyslipidemia clearly associates with gout status, sitting at around 20 to 42% in European control cohorts vs 50 to 80% in European gout cohorts. Ardea cohorts have closer to 40 to 50% dyslipidemia however. The Polynesian cohorts appear to have very high proportions of dyslipidemia, particularly in the gout cohorts.
Stroke proportions are mostly low, except for the Aus/NZ control cohort who have very high proportions of this disease, and in general stroke associates with gout status.
Alcoholic drinks do associate with gout status, and generally Polynesians drink less than Europeans. Males tend to drink much more than females, regardless of gout status, but male gout vs controls have higher rates of drinking while females are no different between cases and controls.
Sugar-sweetened drinks also seem to associate slightly with gout status, and Polynesians drink more of these than Europeans. Sugar-sweetened beverages seem to be gout associated in all Europeans and West Polynesians but not East Polynesians. Females also tend to drink fewer of these than males.
Smoking status shows no correlation with gout status, but smoking rates do seem to be higher in the GlobalGout, Ardea, and Polynesian cohorts. Smoking status may be positively gout associated in females but negatively associated in males (European specific).
Family history of gout is clearly correlated with gout status, and is much higher in Polynesians in general, which is also reflected by mean no. relatives with gout. As expected, family history of gout associates with gout status independent of sex.
Based on this information, I will take the following action:
Remove any cohort with fewer than ~30 individuals (i.e. female Ardea cohorts) as they are unlikely to contribute to the meta-analyses.
Ignore any phenotypes that are not phenotyped at more than 80% across all cohorts, given that there are likely biases as to inclusion in phenotyping (i.e. negative ULT cases reported as missing in some cohorts).
Plotting distribution of each variable
# Build the per-individual plotting table `all_cohorts` from `all_pheno_prs2`.
# Adds display-ordered factors used by every plot below:
#   SEX     - Male / Female
#   GOUT2   - Gout / Control
#   GROUP   - ancestry x gout status (Ngati Porou East Polynesians kept apart as "NP")
#   GROUP2  - GROUP x sex
#   GROUP3  - sex x gout status
#   COHORT2 - study cohort (split by gout status, except for the Ardea trials)
# Individuals that match no COHORT2 category are removed at the end.
# The lookup vectors are defined inside local() so that `all_cohorts` is the
# only object created in the calling environment.
all_cohorts <- local({
  # Geno.SpecificAncestry labels making up each broad ancestry group
  european_anc <- c("European", "European; Iberian", "Iberian")
  east_poly_anc <- c("East Polynesian")
  west_poly_anc <- c("West Polynesian", "Niuean", "Pukapukan")
  # Pheno.Study values pooled into the "Aus/NZ" cohort
  ausnz_studies <- c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")

  group_levels <- c("European Gout", "European Control",
                    "East Polynesian Gout", "East Polynesian Control",
                    "East Polynesian Gout - NP", "East Polynesian Control - NP",
                    "West Polynesian Gout", "West Polynesian Control")
  # Every GROUP level followed by " - male" then " - female", in GROUP order
  group2_levels <- paste0(rep(group_levels, each = 2), " - ", c("male", "female"))
  cohort2_levels <- c("UK Biobank - Gout", "UK Biobank - Control",
                      "Aus/NZ - Gout", "Aus/NZ - Control",
                      "GlobalGout - Gout", "GlobalGout - Control",
                      "Ardea - LASSO", "Ardea - CLEAR1", "Ardea - CLEAR2", "Ardea - CRYSTAL", "Ardea - LIGHT",
                      "East Polynesian - Gout", "East Polynesian - Control",
                      "East Polynesian - Gout - NP", "East Polynesian - Control - NP",
                      "West Polynesian - Gout", "West Polynesian - Control")

  all_pheno_prs2 %>%
    mutate(
      SEX = factor(SEX, levels = c("Male", "Female")),
      GOUT2 = factor(GOUT, levels = c(TRUE, FALSE), labels = c("Gout", "Control")),
      GROUP = factor(
        case_when(
          GOUT & Geno.SpecificAncestry %in% european_anc ~ "European Gout",
          !GOUT & Geno.SpecificAncestry %in% european_anc ~ "European Control",
          GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study != "Ngati Porou" ~ "East Polynesian Gout",
          !GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study != "Ngati Porou" ~ "East Polynesian Control",
          GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study == "Ngati Porou" ~ "East Polynesian Gout - NP",
          !GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study == "Ngati Porou" ~ "East Polynesian Control - NP",
          GOUT & Geno.SpecificAncestry %in% west_poly_anc ~ "West Polynesian Gout",
          !GOUT & Geno.SpecificAncestry %in% west_poly_anc ~ "West Polynesian Control"
        ),
        levels = group_levels
      ),
      # "<GROUP> - male" / "<GROUP> - female"; NA when either part is missing
      GROUP2 = factor(
        case_when(
          !is.na(GROUP) & !is.na(SEX) ~ paste0(GROUP, " - ", tolower(as.character(SEX)))
        ),
        levels = group2_levels
      ),
      GROUP3 = factor(
        case_when(
          GOUT & SEX == "Male" ~ "Male Gout",
          GOUT & SEX == "Female" ~ "Female Gout",
          !GOUT & SEX == "Male" ~ "Male Control",
          !GOUT & SEX == "Female" ~ "Female Control"
        ),
        levels = c("Male Gout", "Female Gout", "Male Control", "Female Control")
      ),
      # Branch order matters: case_when() assigns the first matching cohort
      COHORT2 = factor(
        case_when(
          GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Gout",
          !GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Control",
          GOUT & Geno.SpecificAncestry %in% european_anc & Pheno.Study %in% ausnz_studies ~ "Aus/NZ - Gout",
          !GOUT & Geno.SpecificAncestry %in% european_anc & Pheno.Study %in% ausnz_studies ~ "Aus/NZ - Control",
          GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% european_anc ~ "GlobalGout - Gout",
          !GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% european_anc ~ "GlobalGout - Control",
          Pheno.Study == "Ardea: 401" & Geno.SpecificAncestry %in% european_anc ~ "Ardea - LASSO",
          Pheno.Study == "Ardea: CLEAR1" & Geno.SpecificAncestry %in% european_anc ~ "Ardea - CLEAR1",
          Pheno.Study == "Ardea: CLEAR2" & Geno.SpecificAncestry %in% european_anc ~ "Ardea - CLEAR2",
          Pheno.Study == "Ardea: CRYSTAL" & Geno.SpecificAncestry %in% european_anc ~ "Ardea - CRYSTAL",
          Pheno.Study == "Ardea: LIGHT" & Geno.SpecificAncestry %in% european_anc ~ "Ardea - LIGHT",
          GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study != "Ngati Porou" ~ "East Polynesian - Gout",
          !GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study != "Ngati Porou" ~ "East Polynesian - Control",
          GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study == "Ngati Porou" ~ "East Polynesian - Gout - NP",
          !GOUT & Geno.SpecificAncestry %in% east_poly_anc & Pheno.Study == "Ngati Porou" ~ "East Polynesian - Control - NP",
          GOUT & Geno.SpecificAncestry %in% west_poly_anc ~ "West Polynesian - Gout",
          !GOUT & Geno.SpecificAncestry %in% west_poly_anc ~ "West Polynesian - Control"
        ),
        levels = cohort2_levels
      )
    ) %>%
    filter(!is.na(COHORT2))
})
Age at collection
# Age at collection: one horizontal boxplot per cohort and sex
all_cohorts %>%
  ggplot(aes(x = AGECOL, y = COHORT2, color = SEX)) +
  # reverse the dodge order of the sexes within each cohort row
  geom_boxplot(position = position_dodge2(reverse = TRUE)) +
  scale_x_continuous(limits = c(0, 100)) +
  # reversed limits list the cohorts top-down in factor-level order
  scale_y_discrete(limits = rev(levels(all_cohorts$COHORT2))) +
  labs(x = "Age at Collection (years)") +
  theme(
    axis.title.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_blank()
  )

Serum urate
# Serum urate: one horizontal boxplot per cohort and sex (rows without urate dropped)
all_cohorts %>%
  filter(!is.na(URATE)) %>%
  ggplot(aes(x = URATE, y = COHORT2, color = SEX)) +
  # reverse the dodge order of the sexes within each cohort row
  geom_boxplot(position = position_dodge2(reverse = TRUE)) +
  # x axis runs from 0 to the largest observed urate value
  scale_x_continuous(limits = c(0, max(all_cohorts$URATE, na.rm = TRUE))) +
  # reversed limits list the cohorts top-down in factor-level order
  scale_y_discrete(limits = rev(levels(all_cohorts$COHORT2))) +
  labs(x = "Serum Urate (mg/dL)") +
  theme(
    axis.title.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_blank()
  )

Urate-lowering therapy
# ULT: proportion of gout cases on / not on ULT / without data, per cohort and sex
all_cohorts %>%
  filter(GOUT) %>%
  mutate(
    ULT = factor(
      case_when(
        ULT ~ "On ULT",
        !ULT ~ "Not on ULT",
        is.na(ULT) ~ "No Data"
      ),
      levels = c("No Data", "Not on ULT", "On ULT")
    )
  ) %>%
  # count() returns an ungrouped table, so no residual grouping (or
  # summarise() grouping message) is carried into the plot
  count(COHORT2, ULT, SEX, name = "value") %>%
  ggplot(aes(x = COHORT2, y = value, fill = ULT, color = COHORT2)) +
  # position = "fill" rescales each cohort's bar to proportions
  geom_col(position = "fill") +
  facet_wrap(~ SEX) +
  scale_fill_discrete(type = c("black", "#C0C0C0", "#505050"), limits = c("On ULT", "Not on ULT", "No Data")) +
  # axes are stripped entirely; cohorts are identified by the outline colour legend
  theme(
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_blank()
  )

Age at onset
# Age at onset: one horizontal boxplot per cohort and sex, gout cases only.
# The filtered data are built once inside local() and reused for the y-axis
# limits, so no helper object is left in the calling environment.
local({
  onset_cases <- all_cohorts %>%
    filter(GOUT, !is.na(AGE1ATK))

  ggplot(onset_cases, aes(x = AGE1ATK, y = COHORT2, color = SEX)) +
    # reverse the dodge order of the sexes within each cohort row
    geom_boxplot(position = position_dodge2(reverse = TRUE)) +
    scale_x_continuous(limits = c(0, 100)) +
    # only cohorts with onset data are shown, top-down in factor-level order
    scale_y_discrete(limits = rev(levels(droplevels(onset_cases$COHORT2)))) +
    labs(x = "Age at Onset (years)") +
    theme(
      axis.title.y = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      legend.title = element_blank()
    )
})

Disease duration
# Disease duration: one horizontal boxplot per cohort and sex, gout cases only.
# The filtered data are built once inside local() and reused for the y-axis
# limits, so no helper object is left in the calling environment.
local({
  duration_cases <- all_cohorts %>%
    filter(GOUT, !is.na(DURATION))

  ggplot(duration_cases, aes(x = DURATION, y = COHORT2, color = SEX)) +
    # reverse the dodge order of the sexes within each cohort row
    geom_boxplot(position = position_dodge2(reverse = TRUE)) +
    # x axis runs from 0 to the largest duration in the full (unfiltered) table
    scale_x_continuous(limits = c(0, max(all_cohorts$DURATION, na.rm = TRUE))) +
    # only cohorts with duration data are shown, top-down in factor-level order
    scale_y_discrete(limits = rev(levels(droplevels(duration_cases$COHORT2)))) +
    labs(x = "Disease Duration (years)") +
    theme(
      axis.title.y = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      legend.title = element_blank()
    )
})

Flare frequency
# Flare frequency: one horizontal boxplot per cohort and sex, gout cases only.
# The filtered data are built once inside local() and reused for the y-axis
# limits, so no helper object is left in the calling environment.
local({
  flare_cases <- all_cohorts %>%
    filter(GOUT, !is.na(NUMATK)) %>%
    # cap reported flares at 52 per year
    mutate(NUMATK = pmin(NUMATK, 52))

  ggplot(flare_cases, aes(x = NUMATK, y = COHORT2, color = SEX)) +
    # reverse the dodge order of the sexes within each cohort row
    geom_boxplot(position = position_dodge2(reverse = TRUE)) +
    scale_x_continuous(limits = c(0, 52)) +
    # only cohorts with flare data are shown, top-down in factor-level order
    scale_y_discrete(limits = rev(levels(droplevels(flare_cases$COHORT2)))) +
    labs(x = "Number of Flares in Last Year") +
    theme(
      axis.title.y = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      legend.title = element_blank()
    )
})

Flare frequency (categorical)
# Flare frequency (categorical): proportion of gout cases in each flare
# category, per cohort and sex; missing categories are shown as "No Data"
all_cohorts %>%
  filter(GOUT) %>%
  mutate(
    FLARE_CAT = factor(
      coalesce(as.character(FLARE_CAT), "No Data"),
      levels = rev(c(as.character(0:5), "6 - 11", "12 - 52", "No Data")),
      ordered = TRUE
    )
  ) %>%
  # count() returns an ungrouped table, so no residual grouping (or
  # summarise() grouping message) is carried into the plot
  count(COHORT2, FLARE_CAT, SEX, name = "value") %>%
  ggplot(aes(x = COHORT2, y = value, fill = FLARE_CAT, color = COHORT2)) +
  # position = "fill" rescales each cohort's bar to proportions
  geom_col(position = "fill") +
  facet_wrap(~ SEX) +
  # one colour per factor level: grey for "No Data", then a viridis-style ramp
  scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
  # axes are stripped entirely; cohorts are identified by the outline colour legend
  theme(
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_blank()
  )

Tophi
# Tophi: proportion of gout cases with / without tophi / without data, per cohort and sex
all_cohorts %>%
  filter(GOUT) %>%
  mutate(
    TOPHIGOUT = factor(
      case_when(
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi",
        is.na(TOPHIGOUT) ~ "No Data"
      ),
      levels = c("No Data", "No Tophi", "Tophi")
    )
  ) %>%
  # count() returns an ungrouped table, so no residual grouping (or
  # summarise() grouping message) is carried into the plot
  count(COHORT2, TOPHIGOUT, SEX, name = "value") %>%
  ggplot(aes(x = COHORT2, y = value, fill = TOPHIGOUT, color = COHORT2)) +
  # position = "fill" rescales each cohort's bar to proportions
  geom_col(position = "fill") +
  facet_wrap(~ SEX) +
  scale_fill_discrete(type = c("black", "#C0C0C0", "#505050"), limits = c("Tophi", "No Tophi", "No Data")) +
  # axes are stripped entirely; cohorts are identified by the outline colour legend
  theme(
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_blank()
  )

PRS
# PRS: one horizontal boxplot per cohort and sex
all_cohorts %>%
  ggplot(aes(x = PRS, y = COHORT2, color = SEX)) +
  # reverse the dodge order of the sexes within each cohort row
  geom_boxplot(position = position_dodge2(reverse = TRUE)) +
  # reversed limits list the cohorts top-down in factor-level order
  scale_y_discrete(limits = rev(levels(all_cohorts$COHORT2))) +
  labs(x = "Gout PRS") +
  theme(
    axis.title.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_blank()
  )

Plotting relationship between age/sex and each outcome of interest
Gout
# Relationship between the covariates (age + sex) and each outcome of interest
# (gout, serum urate, onset, duration, flares, tophi).
# Gout: mean age at collection (+/- SE) for cases vs controls, by ancestry and sex
all_cohorts %>%
  filter(!is.na(GROUP)) %>%
  # collapse GROUP to ancestry only, keeping Ngati Porou as its own category
  mutate(
    GROUP = factor(
      case_when(
        str_detect(GROUP, "European") ~ "European",
        str_detect(GROUP, "East") & Pheno.Study != "Ngati Porou" ~ "East Polynesian",
        str_detect(GROUP, "East") & Pheno.Study == "Ngati Porou" ~ "East Polynesian - NP",
        str_detect(GROUP, "West") ~ "West Polynesian"
      ),
      levels = c("European", "East Polynesian", "East Polynesian - NP", "West Polynesian")
    )
  ) %>%
  ggplot(aes(x = GOUT2, y = AGECOL, fill = GROUP)) +
  stat_summary(geom = "bar", fun = mean, position = position_dodge()) +
  stat_summary(
    geom = "errorbar", fun.data = mean_se,
    position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75
  ) +
  facet_wrap(~ SEX) +
  #scale_fill_discrete(limits = c("Gout", "Control")) +
  labs(y = "Mean Age at Collection (years)") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Serum urate v Age
# Serum urate vs age: per-group linear fits, faceted by sex and ULT status
all_cohorts %>%
  filter(!is.na(URATE)) %>%
  mutate(
    ULT2 = factor(
      case_when(
        ULT ~ "On ULT",
        !ULT ~ "Not on ULT",
        is.na(ULT) ~ "No Data / Control"
      ),
      levels = c("On ULT", "Not on ULT", "No Data / Control")
    )
  ) %>%
  ggplot(aes(x = AGECOL, y = URATE, color = GROUP)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX * ULT2) +
  # x is each individual's age, so the label no longer says "Mean"
  labs(
    x = "Age at Collection (years)",
    y = "Serum Urate (mg/dL)"
  ) +
  theme(legend.title = element_blank())

Serum urate v Duration
# Serum urate vs disease duration: per-group linear fits, faceted by sex and ULT status
all_cohorts %>%
  filter(!is.na(URATE)) %>%
  mutate(
    ULT2 = factor(
      case_when(
        ULT ~ "On ULT",
        !ULT ~ "Not on ULT",
        is.na(ULT) ~ "No Data / Control"
      ),
      levels = c("On ULT", "Not on ULT", "No Data / Control")
    )
  ) %>%
  ggplot(aes(x = DURATION, y = URATE, color = GROUP)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX * ULT2) +
  labs(
    x = "Disease Duration (years)",
    y = "Serum Urate (mg/dL)"
  ) +
  theme(legend.title = element_blank())

ULT
# ULT: mean age at collection (+/- SE) of gout cases by ULT status, group and sex
all_cohorts %>%
  filter(GOUT) %>%
  mutate(
    ULT2 = factor(
      case_when(
        ULT ~ "On ULT",
        !ULT ~ "Not on ULT",
        is.na(ULT) ~ "No Data"
      ),
      levels = c("On ULT", "Not on ULT", "No Data")
    )
  ) %>%
  ggplot(aes(x = ULT2, y = AGECOL, fill = GROUP)) +
  stat_summary(geom = "bar", fun = mean, position = position_dodge()) +
  stat_summary(
    geom = "errorbar", fun.data = mean_se,
    position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75
  ) +
  facet_wrap(~ SEX) +
  labs(y = "Mean Age at Collection (years)") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Onset
# Age at onset vs age at collection: per-cohort linear fits, faceted by sex
all_cohorts %>%
  filter(!is.na(AGE1ATK)) %>%
  ggplot(aes(x = AGECOL, y = AGE1ATK, color = COHORT2)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Age at Collection (years)",
    y = "Age at Onset (years)"
  ) +
  theme(legend.title = element_blank())

Duration
# Disease duration vs age at collection: per-cohort linear fits for gout cases, faceted by sex
all_cohorts %>%
  # filter on the plotted variable (DURATION), matching the other duration plots
  filter(GOUT, !is.na(DURATION)) %>%
  ggplot(aes(x = AGECOL, y = DURATION, color = COHORT2)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Age at Collection (years)",
    y = "Disease Duration (years)"
  ) +
  theme(legend.title = element_blank())

Flare frequency (all)
# Flare frequency: mean age at collection (+/- SE) per flare category,
# one line per cohort, gout cases outside the Ardea cohorts
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & !str_detect(COHORT2, "Ardea")) %>%
  ggplot(mapping = aes(x = FLARE_CAT, y = AGECOL, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # explicit grouping so the means are joined across categories within a cohort
  stat_summary(mapping = aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Age at Collection (years)"
  ) +
  theme(legend.title = element_blank())

Flare frequency (> 1)
# Flare frequency (two or more flares): mean age at collection (+/- SE) per
# flare category, one line per cohort, gout cases only
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & NUMATK >= 2) %>%
  ggplot(mapping = aes(x = FLARE_CAT, y = AGECOL, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # explicit grouping so the means are joined across categories within a cohort
  stat_summary(mapping = aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Age at Collection (years)"
  ) +
  theme(legend.title = element_blank())

Tophi
# Tophi: mean age at collection (+/- SE) of gout cases by tophi status, group
# and sex; the CRYSTAL cohort is left out
all_cohorts %>%
  filter(GOUT & !str_detect(COHORT2, "CRYSTAL")) %>%
  mutate(
    TOPHIGOUT = factor(
      case_when(
        is.na(TOPHIGOUT) ~ "No Data",
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    )
  ) %>%
  ggplot(aes(x = TOPHIGOUT, y = AGECOL, fill = GROUP)) +
  stat_summary(geom = "bar", fun = mean, position = position_dodge()) +
  stat_summary(
    geom = "errorbar", fun.data = mean_se,
    position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75
  ) +
  facet_wrap(~ SEX) +
  labs(y = "Mean Age at Collection (years)") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Plotting relationship between PRS and each outcome of interest
Gout vs PRS
# Gout vs PRS: mean PRS (+/- SE) for cases and controls within each group, by sex
all_cohorts %>%
  filter(!is.na(GROUP)) %>%
  ggplot(mapping = aes(x = GROUP, y = PRS, fill = GOUT2)) +
  stat_summary(geom = "bar", fun = mean, position = position_dodge()) +
  stat_summary(
    geom = "errorbar", fun.data = mean_se,
    position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75
  ) +
  facet_wrap(~ SEX) +
  scale_fill_discrete(limits = c("Gout", "Control")) +
  labs(y = "Gout PRS") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Serum urate vs PRS
# Serum urate vs PRS: per-group linear fits, faceted by sex and ULT status
all_cohorts %>%
  filter(!is.na(URATE)) %>%
  mutate(
    ULT2 = factor(
      case_when(
        ULT ~ "On ULT",
        !ULT ~ "Not on ULT",
        is.na(ULT) ~ "No Data / Control"
      ),
      levels = c("On ULT", "Not on ULT", "No Data / Control")
    )
  ) %>%
  ggplot(aes(x = PRS, y = URATE, color = GROUP)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX * ULT2) +
  labs(
    x = "Gout PRS",
    y = "Serum Urate (mg/dL)"
  ) +
  theme(legend.title = element_blank())

ULT vs PRS
# ULT vs PRS: mean PRS (+/- SE) of gout cases by ULT status, group and sex
all_cohorts %>%
  filter(GOUT) %>%
  mutate(
    ULT2 = factor(
      case_when(
        ULT ~ "On ULT",
        !ULT ~ "Not on ULT",
        is.na(ULT) ~ "No Data"
      ),
      levels = c("On ULT", "Not on ULT", "No Data")
    )
  ) %>%
  ggplot(aes(x = ULT2, y = PRS, fill = GROUP)) +
  stat_summary(geom = "bar", fun = mean, position = position_dodge()) +
  stat_summary(
    geom = "errorbar", fun.data = mean_se,
    position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75
  ) +
  facet_wrap(~ SEX) +
  labs(y = "Gout PRS") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Onset vs PRS
# Onset vs PRS: per-cohort linear fits of age at onset on PRS, faceted by sex
all_cohorts %>%
  filter(!is.na(AGE1ATK)) %>%
  ggplot(aes(x = PRS, y = AGE1ATK, color = COHORT2)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Gout PRS",
    y = "Age at Onset (years)"
  ) +
  theme(legend.title = element_blank())

Duration vs PRS
# Duration vs PRS: per-cohort linear fits of disease duration on PRS, faceted by sex
all_cohorts %>%
  filter(!is.na(DURATION)) %>%
  ggplot(aes(x = PRS, y = DURATION, color = COHORT2)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Gout PRS",
    y = "Disease Duration (years)"
  ) +
  theme(legend.title = element_blank())

Flares (all) vs PRS
# Flares (all) vs PRS: mean PRS (+/- SE) per flare category, one line per
# cohort, gout cases outside the Ardea cohorts
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & !str_detect(COHORT2, "Ardea")) %>%
  ggplot(mapping = aes(x = FLARE_CAT, y = PRS, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # explicit grouping so the means are joined across categories within a cohort
  stat_summary(mapping = aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Gout PRS"
  ) +
  theme(legend.title = element_blank())

Flares (> 1) vs PRS
# Flares (two or more) vs PRS: mean PRS (+/- SE) per flare category, one line
# per cohort, gout cases only
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & NUMATK >= 2) %>%
  ggplot(mapping = aes(x = FLARE_CAT, y = PRS, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # explicit grouping so the means are joined across categories within a cohort
  stat_summary(mapping = aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Gout PRS"
  ) +
  theme(legend.title = element_blank())

Tophi vs PRS
# Tophi vs PRS: mean PRS (+/- SE) of gout cases by tophi status, group and
# sex; the CRYSTAL cohort is left out
all_cohorts %>%
  filter(GOUT & !str_detect(COHORT2, "CRYSTAL")) %>%
  mutate(
    TOPHIGOUT = factor(
      case_when(
        is.na(TOPHIGOUT) ~ "No Data",
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    )
  ) %>%
  ggplot(aes(x = TOPHIGOUT, y = PRS, fill = GROUP)) +
  stat_summary(geom = "bar", fun = mean, position = position_dodge()) +
  stat_summary(
    geom = "errorbar", fun.data = mean_se,
    position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75
  ) +
  facet_wrap(~ SEX) +
  labs(y = "Gout PRS") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Plotting relationship between PRS and age/sex
Age vs Genotyped PRS
# Age vs PRS: per-group linear fits of PRS on age at collection, faceted by sex
all_cohorts %>%
  ggplot(aes(x = AGECOL, y = PRS, color = GROUP)) +
  # regression lines only, no confidence ribbons
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  # x is each individual's age, so the label no longer says "Mean"
  labs(
    x = "Age at Collection (years)",
    y = "Gout PRS"
  ) +
  theme(legend.title = element_blank())

Plotting relationship between severity traits
Onset vs Tophi
# Relationships between severity traits.
# Onset vs tophi: age at onset by tophi status, coloured by group, faceted by
# sex; the CRYSTAL cohort is left out
all_cohorts %>%
  filter(!is.na(AGE1ATK) & !str_detect(COHORT2, "CRYSTAL")) %>%
  mutate(
    TOPHIGOUT = factor(
      case_when(
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi",
        is.na(TOPHIGOUT) ~ "No Data"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    )
  ) %>%
  ggplot(mapping = aes(x = TOPHIGOUT, y = AGE1ATK, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Age at Onset (years)") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Duration vs Tophi
# Duration vs tophi: disease duration by tophi status, coloured by group,
# faceted by sex; the CRYSTAL cohort is left out
all_cohorts %>%
  filter(!is.na(DURATION) & !str_detect(COHORT2, "CRYSTAL")) %>%
  mutate(
    TOPHIGOUT = factor(
      case_when(
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi",
        is.na(TOPHIGOUT) ~ "No Data"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    )
  ) %>%
  ggplot(mapping = aes(x = TOPHIGOUT, y = DURATION, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Disease Duration (years)") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Flares (all) vs Tophi
# Flares (all) vs tophi: flares in the last year by tophi status, coloured by
# group, faceted by sex; Ardea cohorts (including CRYSTAL) are left out
all_cohorts %>%
  filter(
    !is.na(NUMATK) &
      !str_detect(COHORT2, "CRYSTAL") &
      !str_detect(COHORT2, "Ardea")
  ) %>%
  mutate(
    TOPHIGOUT = factor(
      case_when(
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi",
        is.na(TOPHIGOUT) ~ "No Data"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    ),
    # cap reported flares at 52 per year
    NUMATK = pmin(NUMATK, 52)
  ) %>%
  ggplot(mapping = aes(x = TOPHIGOUT, y = NUMATK, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Number of Flares in Last Year") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Flares (> 1) vs Tophi
# Flares (two or more) vs tophi: flares in the last year by tophi status,
# coloured by group, faceted by sex; the CRYSTAL cohort is left out
all_cohorts %>%
  filter(
    !is.na(NUMATK) &
      !str_detect(COHORT2, "CRYSTAL") &
      NUMATK >= 2
  ) %>%
  mutate(
    TOPHIGOUT = factor(
      case_when(
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi",
        is.na(TOPHIGOUT) ~ "No Data"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    ),
    # cap reported flares at 52 per year
    NUMATK = pmin(NUMATK, 52)
  ) %>%
  ggplot(mapping = aes(x = TOPHIGOUT, y = NUMATK, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Number of Flares in Last Year") +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Flares (all - cat) vs Tophi
# Flares (all, categorical) vs tophi: proportion of gout cases in each flare
# category by tophi status, one panel per sex x group; Ardea cohorts
# (including CRYSTAL) are left out
all_cohorts %>%
  filter(
    GOUT,
    !str_detect(COHORT2, "CRYSTAL"),
    !str_detect(COHORT2, "Ardea")
  ) %>%
  mutate(
    # missing flare categories are shown as "No Data"
    FLARE_CAT = factor(
      coalesce(as.character(FLARE_CAT), "No Data"),
      levels = rev(c(as.character(0:5), "6 - 11", "12 - 52", "No Data")),
      ordered = TRUE
    ),
    TOPHIGOUT = factor(
      case_when(
        is.na(TOPHIGOUT) ~ "No Data",
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    )
  ) %>%
  # count() returns an ungrouped table, so no residual grouping (or
  # summarise() grouping message) is carried into the plot
  count(GROUP, TOPHIGOUT, FLARE_CAT, SEX, name = "value") %>%
  ggplot(aes(x = TOPHIGOUT, y = value, fill = FLARE_CAT, color = TOPHIGOUT)) +
  # position = "fill" rescales each bar to proportions
  geom_col(position = "fill") +
  facet_wrap(~ SEX * GROUP) +
  # one colour per factor level: grey for "No Data", then a viridis-style ramp
  scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
  # axes are stripped entirely; tophi status is identified by the outline colour legend
  theme(
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_blank()
  )

Flares (> 1 - cat) vs Tophi
# Stacked proportion bars of flare category within each tophi status, one
# panel per SEX x GROUP combination, for gout rows with at least two flares in
# the last year. Only rows whose COHORT2 contains "CRYSTAL" are excluded.
all_cohorts %>%
  filter(GOUT & !str_detect(COHORT2, "CRYSTAL") & NUMATK >= 2) %>%
  mutate(
    # Missing categories become "No Data"; levels run from "No Data" down to "0"
    FLARE_CAT = factor(
      coalesce(as.character(FLARE_CAT), "No Data"),
      levels = c("No Data", "12 - 52", "6 - 11", as.character(5:0)),
      ordered = TRUE
    ),
    TOPHIGOUT = factor(
      case_when(
        is.na(TOPHIGOUT) ~ "No Data",
        TOPHIGOUT ~ "Tophi",
        !TOPHIGOUT ~ "No Tophi"
      ),
      levels = c("Tophi", "No Tophi", "No Data")
    )
  ) %>%
  group_by(GROUP, TOPHIGOUT, FLARE_CAT, SEX) %>%
  summarise(value = n(), .groups = "drop_last") %>%
  ggplot(aes(x = TOPHIGOUT, y = value, fill = FLARE_CAT, color = TOPHIGOUT)) +
  # position = "fill" rescales each bar to proportions
  geom_col(position = "fill") +
  facet_wrap(~ SEX * GROUP) +
  scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )

Onset vs Flares (all)
# Linear trend (no confidence band) of flares in the last year against age at
# onset, one line per GROUP, faceted by SEX. Rows missing either variable and
# rows whose COHORT2 contains "Ardea" are excluded.
all_cohorts %>%
  filter(!is.na(AGE1ATK) & !is.na(NUMATK) & !str_detect(COHORT2, "Ardea")) %>%
  ggplot(aes(x = AGE1ATK, y = NUMATK, color = GROUP)) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Age at Onset (years)",
    y = "Number of Flares in Last Year"
  ) +
  theme(legend.title = element_blank())

Onset vs Flares (> 1)
# Linear trend (no confidence band) of flares in the last year against age at
# onset, one line per GROUP, faceted by SEX, restricted to rows with at least
# two flares. Rows missing either variable are excluded.
all_cohorts %>%
  filter(!is.na(AGE1ATK) & !is.na(NUMATK) & NUMATK >= 2) %>%
  ggplot(aes(x = AGE1ATK, y = NUMATK, color = GROUP)) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Age at Onset (years)",
    y = "Number of Flares in Last Year"
  ) +
  theme(legend.title = element_blank())

Duration vs Flares (all)
# Linear trend (no confidence band) of flares in the last year against disease
# duration, one line per GROUP, faceted by SEX. Rows missing either variable
# and rows whose COHORT2 contains "Ardea" are excluded.
all_cohorts %>%
  filter(!is.na(DURATION) & !is.na(NUMATK) & !str_detect(COHORT2, "Ardea")) %>%
  ggplot(aes(x = DURATION, y = NUMATK, color = GROUP)) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Disease Duration (years)",
    y = "Number of Flares in Last Year"
  ) +
  theme(legend.title = element_blank())

Duration vs Flares (> 1)
# Linear trend (no confidence band) of flares in the last year against disease
# duration, one line per GROUP, faceted by SEX, restricted to rows with at
# least two flares. Rows missing either variable are excluded.
all_cohorts %>%
  filter(!is.na(DURATION) & !is.na(NUMATK) & NUMATK >= 2) %>%
  ggplot(aes(x = DURATION, y = NUMATK, color = GROUP)) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ SEX) +
  labs(
    x = "Disease Duration (years)",
    y = "Number of Flares in Last Year"
  ) +
  theme(legend.title = element_blank())

Onset vs Flares (all - cat)
# Mean age at onset (points, +/- SE error bars, connecting line per cohort)
# within each flare category, coloured by COHORT2 and faceted by SEX. Gout rows
# with a flare category only; rows whose COHORT2 contains "Ardea" are excluded.
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & !str_detect(COHORT2, "Ardea")) %>%
  ggplot(aes(x = FLARE_CAT, y = AGE1ATK, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # group = COHORT2 so each cohort's means are joined across the categories
  stat_summary(aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Age at Onset (years)"
  ) +
  theme(legend.title = element_blank())

Onset vs Flares (> 1 - cat)
# Mean age at onset (points, +/- SE error bars, connecting line per cohort)
# within each flare category, coloured by COHORT2 and faceted by SEX, for gout
# rows with a flare category and at least two flares in the last year.
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & NUMATK >= 2) %>%
  ggplot(aes(x = FLARE_CAT, y = AGE1ATK, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # group = COHORT2 so each cohort's means are joined across the categories
  stat_summary(aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Age at Onset (years)"
  ) +
  theme(legend.title = element_blank())

Duration vs Flares (all - cat)
# Mean disease duration (points, +/- SE error bars, connecting line per cohort)
# within each flare category, coloured by COHORT2 and faceted by SEX. Gout rows
# with a flare category only; rows whose COHORT2 contains "Ardea" are excluded.
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & !str_detect(COHORT2, "Ardea")) %>%
  ggplot(aes(x = FLARE_CAT, y = DURATION, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # group = COHORT2 so each cohort's means are joined across the categories
  stat_summary(aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Disease Duration (years)"
  ) +
  theme(legend.title = element_blank())

Duration vs Flares (> 1 - cat)
# Mean disease duration (points, +/- SE error bars, connecting line per cohort)
# within each flare category, coloured by COHORT2 and faceted by SEX, for gout
# rows with a flare category and at least two flares in the last year.
all_cohorts %>%
  filter(GOUT & !is.na(FLARE_CAT) & NUMATK >= 2) %>%
  ggplot(aes(x = FLARE_CAT, y = DURATION, color = COHORT2)) +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  # group = COHORT2 so each cohort's means are joined across the categories
  stat_summary(aes(group = COHORT2), fun = mean, geom = "line") +
  facet_wrap(~ SEX) +
  labs(
    x = "Number of Flares in Last Year (categorical)",
    y = "Disease Duration (years)"
  ) +
  theme(legend.title = element_blank())

Table of MAF for each SNP in each cohort
# Making a table with SNPs as rows and columns representing MAF for each cohort for that SNP
# SNPs to tabulate: the RSID column of UKBB_Gene_OR (an object created outside this chunk)
SNPlist <- UKBB_Gene_OR$RSID
# Allele frequency of each requested SNP in one cohort, as formatted strings.
#
# cohort: data frame (or tibble) with one genotype column per SNP, coded as the
#         number of copies (0, 1 or 2) of the counted allele.
# snps:   character vector of column names to summarise.
#
# Returns an unnamed character vector, one element per entry of `snps` and in
# the same order, each formatted with three decimals ("%#.3f"). An empty `snps`
# gives character(0).
#
# Genotypes other than 0/1/2 are treated as missing, and any row with a missing
# genotype at ANY of the requested SNPs is dropped before counting, so every
# SNP is summarised over the same set of individuals. The frequency is
# (heterozygotes + 2 * homozygotes) / (2 * individuals); despite the function
# name it is the frequency of the counted allele and can exceed 0.5. If no
# complete rows remain, the result is "NaN" for every SNP.
get_maf <- function(cohort, snps) {
  if (length(snps) == 0) {
    return(character(0))
  }
  geno <- as.data.frame(cohort)[, snps, drop = FALSE]
  # Values outside 0:2 become NA here and are removed by na.omit() below
  geno[] <- lapply(geno, factor, levels = 0:2)
  geno <- na.omit(geno)
  n_chromosomes <- 2 * nrow(geno)
  vapply(
    geno,
    function(g) {
      sprintf(fmt = "%#.3f", (sum(g == 1) + 2 * sum(g == 2)) / n_chromosomes)
    },
    character(1),
    USE.NAMES = FALSE
  )
}
# Build the long table (one row per cohort x SNP) of formatted allele
# frequencies and widen it to one row per cohort with one column per SNP.
# NOTE(review): the rep() calls assume data_list2 holds the cohorts in the same
# order as cohortstring — confirm where those objects are defined.
freq_table <- tibble(
  "Cohort" = rep(cohortstring, each = length(SNPlist)),
  "MAF" = unlist(lapply(data_list2, function(x) get_maf(cohort = x, snps = SNPlist))),
  "SNP" = rep(SNPlist, length(cohortstring))
) %>%
  pivot_wider(names_from = SNP, values_from = MAF)
# Transpose with transpose_df() (defined elsewhere in this file) and move the
# "Cohort" column of the transposed table into the row names.
# everything() is used instead of a column range computed from `freq_table`:
# inside this pipeline `freq_table` still refers to the pre-transpose table,
# whose column count is not the column count of the table being mutated.
# NOTE(review): as written, the pattern and replacement below are both a single
# space, so the replacement is a no-op — presumably a non-breaking space or
# HTML entity was intended; confirm against the original notebook.
freq_table <- transpose_df(freq_table) %>%
  column_to_rownames(var = "Cohort") %>%
  mutate(across(.cols = everything(), ~ str_replace(string = .x, pattern = " ", replacement = " ")))
row.names(freq_table) <- str_replace(row.names(freq_table), " ", " ")
# Render as a striped, scrollable HTML table; escape = FALSE leaves cell
# contents unescaped. `clean_names` is expected to hold the column headers.
freq_table %>%
  kable(
    col.names = clean_names,
    align = "c",
    escape = FALSE
  ) %>%
  kable_styling("striped") %>%
  scroll_box(width = "900px", height = "475px")
|
|
UK Biobank Gout Male
|
UK Biobank Gout Female
|
UK Biobank Control Male
|
UK Biobank Control Female
|
Aus/NZ European Gout Male
|
Aus/NZ European Gout Female
|
Aus/NZ European Control Male
|
Aus/NZ European Control Female
|
GlobalGout Gout Male
|
GlobalGout Gout Female
|
GlobalGout Control Male
|
GlobalGout Control Female
|
Ardea LASSO Gout Male
|
Ardea LASSO Gout Female
|
Ardea CLEAR1 Gout Male
|
Ardea CLEAR1 Gout Female
|
Ardea CLEAR2 Gout Male
|
Ardea CLEAR2 Gout Female
|
Ardea CRYSTAL Gout Male
|
Ardea CRYSTAL Gout Female
|
Ardea LIGHT Gout Male
|
Ardea LIGHT Gout Female
|
East Polynesian Gout Male
|
East Polynesian Gout Female
|
East Polynesian Control Male
|
East Polynesian Control Female
|
East Polynesian Gout Male NP
|
East Polynesian Gout Female NP
|
East Polynesian Control Male NP
|
East Polynesian Control Female NP
|
West Polynesian Gout Male
|
West Polynesian Gout Female
|
West Polynesian Control Male
|
West Polynesian Control Female
|
|
rs10910845
|
0.495
|
0.475
|
0.462
|
0.461
|
0.491
|
0.460
|
0.473
|
0.477
|
0.496
|
0.496
|
0.591
|
0.481
|
0.537
|
0.508
|
0.504
|
0.344
|
0.531
|
0.357
|
0.520
|
0.375
|
0.560
|
0.625
|
0.686
|
0.721
|
0.676
|
0.632
|
0.673
|
0.750
|
0.705
|
0.676
|
0.704
|
0.657
|
0.653
|
0.636
|
|
rs11264341
|
0.597
|
0.597
|
0.571
|
0.569
|
0.602
|
0.560
|
0.554
|
0.542
|
0.609
|
0.569
|
0.568
|
0.608
|
0.587
|
0.631
|
0.585
|
0.656
|
0.609
|
0.714
|
0.649
|
0.625
|
0.624
|
0.562
|
0.450
|
0.443
|
0.432
|
0.461
|
0.440
|
0.536
|
0.455
|
0.419
|
0.396
|
0.537
|
0.402
|
0.353
|
|
rs1260326
|
0.437
|
0.429
|
0.391
|
0.392
|
0.449
|
0.486
|
0.396
|
0.389
|
0.465
|
0.387
|
0.545
|
0.513
|
0.449
|
0.485
|
0.476
|
0.531
|
0.506
|
0.643
|
0.471
|
0.375
|
0.477
|
0.438
|
0.346
|
0.361
|
0.303
|
0.277
|
0.278
|
0.339
|
0.330
|
0.378
|
0.354
|
0.315
|
0.249
|
0.297
|
|
rs9847710
|
0.453
|
0.443
|
0.424
|
0.424
|
0.453
|
0.440
|
0.429
|
0.414
|
0.448
|
0.484
|
0.386
|
0.386
|
0.457
|
0.438
|
0.493
|
0.438
|
0.406
|
0.500
|
0.434
|
0.375
|
0.445
|
0.688
|
0.462
|
0.496
|
0.432
|
0.473
|
0.431
|
0.464
|
0.443
|
0.446
|
0.654
|
0.602
|
0.643
|
0.639
|
|
rs7675964
|
0.807
|
0.825
|
0.723
|
0.726
|
0.805
|
0.838
|
0.728
|
0.721
|
0.800
|
0.798
|
0.693
|
0.753
|
0.805
|
0.808
|
0.839
|
0.781
|
0.810
|
0.929
|
0.829
|
1.000
|
0.780
|
0.938
|
0.779
|
0.795
|
0.778
|
0.741
|
0.770
|
0.839
|
0.716
|
0.770
|
0.578
|
0.667
|
0.530
|
0.519
|
|
rs4481233
|
0.877
|
0.894
|
0.809
|
0.811
|
0.882
|
0.895
|
0.815
|
0.805
|
0.869
|
0.887
|
0.830
|
0.861
|
0.882
|
0.908
|
0.911
|
0.875
|
0.893
|
0.929
|
0.903
|
1.000
|
0.872
|
0.938
|
0.971
|
0.975
|
0.961
|
0.958
|
0.956
|
0.982
|
0.943
|
0.905
|
0.976
|
0.972
|
0.975
|
0.947
|
|
rs6811287
|
0.614
|
0.612
|
0.549
|
0.551
|
0.613
|
0.650
|
0.543
|
0.557
|
0.605
|
0.649
|
0.568
|
0.608
|
0.643
|
0.585
|
0.641
|
0.656
|
0.613
|
0.643
|
0.637
|
0.625
|
0.601
|
0.625
|
0.607
|
0.652
|
0.564
|
0.598
|
0.573
|
0.679
|
0.625
|
0.446
|
0.689
|
0.657
|
0.749
|
0.681
|
|
rs2231142
|
0.199
|
0.156
|
0.110
|
0.113
|
0.216
|
0.188
|
0.110
|
0.117
|
0.191
|
0.165
|
0.114
|
0.089
|
0.228
|
0.169
|
0.246
|
0.281
|
0.226
|
0.214
|
0.283
|
0.125
|
0.193
|
0.000
|
0.115
|
0.074
|
0.066
|
0.072
|
0.089
|
0.071
|
0.057
|
0.041
|
0.462
|
0.426
|
0.236
|
0.231
|
|
rs10011796
|
0.532
|
0.513
|
0.458
|
0.460
|
0.538
|
0.512
|
0.457
|
0.468
|
0.529
|
0.516
|
0.534
|
0.456
|
0.555
|
0.608
|
0.537
|
0.625
|
0.596
|
0.714
|
0.560
|
0.500
|
0.583
|
0.438
|
0.599
|
0.635
|
0.552
|
0.555
|
0.548
|
0.536
|
0.534
|
0.541
|
0.678
|
0.630
|
0.613
|
0.594
|
|
rs1165196
|
0.607
|
0.604
|
0.566
|
0.569
|
0.605
|
0.638
|
0.580
|
0.579
|
0.628
|
0.593
|
0.557
|
0.475
|
0.649
|
0.600
|
0.602
|
0.656
|
0.617
|
0.714
|
0.606
|
0.750
|
0.610
|
0.688
|
0.734
|
0.713
|
0.695
|
0.707
|
0.669
|
0.714
|
0.727
|
0.595
|
0.763
|
0.759
|
0.704
|
0.700
|
|
rs853685
|
0.178
|
0.176
|
0.165
|
0.165
|
0.170
|
0.145
|
0.154
|
0.162
|
0.159
|
0.185
|
0.136
|
0.101
|
0.145
|
0.185
|
0.139
|
0.188
|
0.178
|
0.143
|
0.166
|
0.125
|
0.142
|
0.125
|
0.091
|
0.123
|
0.108
|
0.089
|
0.097
|
0.161
|
0.125
|
0.135
|
0.173
|
0.130
|
0.123
|
0.117
|
|
rs3812316
|
0.893
|
0.892
|
0.871
|
0.871
|
0.896
|
0.881
|
0.875
|
0.867
|
0.890
|
0.879
|
0.920
|
0.880
|
0.890
|
0.954
|
0.911
|
0.875
|
0.895
|
0.929
|
0.914
|
1.000
|
0.913
|
1.000
|
0.974
|
0.984
|
0.967
|
0.958
|
0.952
|
0.946
|
0.989
|
0.973
|
0.990
|
0.991
|
0.982
|
0.978
|
|
rs1171616
|
0.797
|
0.799
|
0.768
|
0.768
|
0.791
|
0.788
|
0.776
|
0.779
|
0.795
|
0.786
|
0.761
|
0.804
|
0.808
|
0.792
|
0.809
|
0.844
|
0.833
|
0.714
|
0.800
|
0.625
|
0.844
|
0.812
|
0.949
|
0.959
|
0.936
|
0.949
|
0.935
|
0.946
|
0.943
|
0.878
|
0.986
|
0.963
|
0.977
|
0.975
|
|
rs17300741
|
0.495
|
0.493
|
0.449
|
0.451
|
0.491
|
0.405
|
0.489
|
0.465
|
0.528
|
0.560
|
0.466
|
0.582
|
0.507
|
0.585
|
0.509
|
0.469
|
0.533
|
0.500
|
0.520
|
0.500
|
0.518
|
0.750
|
0.797
|
0.807
|
0.793
|
0.771
|
0.766
|
0.768
|
0.807
|
0.730
|
0.818
|
0.778
|
0.859
|
0.861
|
|
rs7937990
|
0.210
|
0.188
|
0.185
|
0.187
|
0.201
|
0.195
|
0.181
|
0.205
|
0.236
|
0.222
|
0.205
|
0.158
|
0.225
|
0.192
|
0.222
|
0.375
|
0.285
|
0.143
|
0.243
|
0.000
|
0.248
|
0.250
|
0.456
|
0.447
|
0.461
|
0.408
|
0.403
|
0.500
|
0.511
|
0.378
|
0.452
|
0.454
|
0.369
|
0.394
|
|
rs4014195
|
0.368
|
0.360
|
0.342
|
0.343
|
0.383
|
0.350
|
0.338
|
0.347
|
0.382
|
0.367
|
0.398
|
0.386
|
0.370
|
0.354
|
0.367
|
0.406
|
0.412
|
0.571
|
0.369
|
0.250
|
0.376
|
0.500
|
0.420
|
0.352
|
0.402
|
0.398
|
0.351
|
0.375
|
0.386
|
0.378
|
0.252
|
0.269
|
0.201
|
0.256
|
|
rs1106766
|
0.784
|
0.778
|
0.757
|
0.757
|
0.799
|
0.798
|
0.764
|
0.767
|
0.786
|
0.722
|
0.852
|
0.797
|
0.805
|
0.823
|
0.850
|
0.844
|
0.816
|
0.857
|
0.803
|
0.875
|
0.821
|
0.812
|
0.928
|
0.947
|
0.934
|
0.933
|
0.919
|
0.982
|
0.966
|
0.946
|
0.978
|
0.981
|
0.962
|
0.975
|
|
rs28652632
|
0.546
|
0.533
|
0.521
|
0.523
|
0.517
|
0.545
|
0.531
|
0.537
|
0.528
|
0.512
|
0.523
|
0.500
|
0.536
|
0.515
|
0.498
|
0.500
|
0.538
|
0.214
|
0.523
|
0.625
|
0.578
|
0.562
|
0.681
|
0.668
|
0.660
|
0.666
|
0.738
|
0.696
|
0.625
|
0.797
|
0.592
|
0.574
|
0.550
|
0.567
|
|
rs738409
|
0.799
|
0.804
|
0.783
|
0.783
|
0.801
|
0.800
|
0.777
|
0.796
|
0.780
|
0.802
|
0.761
|
0.772
|
0.780
|
0.785
|
0.750
|
0.750
|
0.818
|
0.857
|
0.780
|
0.750
|
0.775
|
0.875
|
0.748
|
0.816
|
0.759
|
0.791
|
0.802
|
0.839
|
0.784
|
0.797
|
0.739
|
0.806
|
0.754
|
0.758
|
# rs35332062 is monomorphic in West Polynesian females, so it should be removed from the individual SNP analysis, though it can be left in the PRS
Running all models of interest
The purpose of this section is to run all models of interest for assessing the relationship between gout genetic risk and severity of gout. This includes modeling gout vs the PRS, and modeling age at onset, tophi, and flare frequency vs the PRS. For now, I have decided to not run flare frequency models and to just display the plots while discussing how complex the flare phenotype is.
# Datasets
# load() restores the objects saved in Output/all_pheno_prs.RData; the pipeline below expects one of them to be `all_pheno_prs`
load(here("Output/all_pheno_prs.RData"))
# Making FLARE_CAT variable and setting all control gout severity traits to NA
# The mutate() below also builds the grouping factors used for plotting and modelling (SEX, GOUT2, GROUP, GROUP2, GROUP3, COHORT2),
# and the filter() at the end keeps only rows usable in the analysis.
# Arguments of mutate() are evaluated in order, so each expression sees the columns as modified by the expressions above it.
all_pheno_prs <- all_pheno_prs %>%
# FLARE_CAT: ordered factor with labels "0".."5", "6 - 11", "12 - 52", derived from the flare count NUMATK
# NOTE(review): FLARE_CAT is built from NUMATK as loaded, i.e. before NUMATK is set to NA for controls and before the round() at the end of this mutate().
# Non-integer counts within [0, 5], counts falling strictly between the bands (5-6, 11-12) and counts above 52 therefore get FLARE_CAT = NA,
# and a control with a recorded NUMATK keeps a non-NA FLARE_CAT — confirm this is intended.
mutate(FLARE_CAT = factor(case_when(between(NUMATK, 0, 5) ~ paste0(as.character(NUMATK), " flares in last year"),
between(NUMATK, 6, 11) ~ "One every one to two months",
between(NUMATK, 12, 52) ~ "One or more per month"),
levels = c(paste0(0:5, " flares in last year"),
"One every one to two months",
"One or more per month"),
labels = c(paste0(0:5),
"6 - 11",
"12 - 52"),
ordered = TRUE),
# Severity traits are kept for gout cases only: a case_when() with the single branch `GOUT ~ x` returns NA wherever GOUT is FALSE or NA
AGE1ATK = case_when(GOUT ~ AGE1ATK),
DURATION = case_when(GOUT ~ DURATION),
NUMATK = case_when(GOUT ~ NUMATK),
TOPHIGOUT = case_when(GOUT ~ TOPHIGOUT),
ULT = case_when(GOUT ~ ULT),
SEX = factor(SEX, levels = c("Male", "Female")),
GOUT2 = factor(GOUT, levels = c(TRUE, FALSE), labels = c("Gout", "Control")),
# GROUP: ancestry x case/control label; East Polynesian rows from the "Ngati Porou" study get their own "- NP" levels. Rows matching no rule are NA.
GROUP = factor(case_when(GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Gout",
!GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Control",
GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian Gout",
!GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian Control",
GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian Gout - NP",
!GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian Control - NP",
GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Gout",
!GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Control"),
levels = c("European Gout", "European Control", "East Polynesian Gout", "East Polynesian Control", "East Polynesian Gout - NP", "East Polynesian Control - NP", "West Polynesian Gout", "West Polynesian Control")),
# GROUP2: GROUP further split by sex
GROUP2 = factor(case_when(GROUP == "European Gout" & SEX == "Male" ~ "European Gout - male",
GROUP == "European Gout" & SEX == "Female" ~ "European Gout - female",
GROUP == "European Control" & SEX == "Male" ~ "European Control - male",
GROUP == "European Control" & SEX == "Female" ~ "European Control - female",
GROUP == "East Polynesian Gout" & SEX == "Male" ~ "East Polynesian Gout - male",
GROUP == "East Polynesian Gout" & SEX == "Female" ~ "East Polynesian Gout - female",
GROUP == "East Polynesian Control" & SEX == "Male" ~ "East Polynesian Control - male",
GROUP == "East Polynesian Control" & SEX == "Female" ~ "East Polynesian Control - female",
GROUP == "East Polynesian Gout - NP" & SEX == "Male" ~ "East Polynesian Gout - NP - male",
GROUP == "East Polynesian Gout - NP" & SEX == "Female" ~ "East Polynesian Gout - NP - female",
GROUP == "East Polynesian Control - NP" & SEX == "Male" ~ "East Polynesian Control - NP - male",
GROUP == "East Polynesian Control - NP" & SEX == "Female" ~ "East Polynesian Control - NP - female",
GROUP == "West Polynesian Gout" & SEX == "Male" ~ "West Polynesian Gout - male",
GROUP == "West Polynesian Gout" & SEX == "Female" ~ "West Polynesian Gout - female",
GROUP == "West Polynesian Control" & SEX == "Male" ~ "West Polynesian Control - male",
GROUP == "West Polynesian Control" & SEX == "Female" ~ "West Polynesian Control - female"),
levels = c("European Gout - male", "European Gout - female", "European Control - male", "European Control - female", "East Polynesian Gout - male", "East Polynesian Gout - female", "East Polynesian Control - male", "East Polynesian Control - female", "East Polynesian Gout - NP - male", "East Polynesian Gout - NP - female", "East Polynesian Control - NP - male", "East Polynesian Control - NP - female", "West Polynesian Gout - male", "West Polynesian Gout - female", "West Polynesian Control - male", "West Polynesian Control - female")),
# GROUP3: sex x case/control only, ignoring ancestry
GROUP3 = factor(case_when(GOUT & SEX == "Male" ~ "Male Gout",
GOUT & SEX == "Female" ~ "Female Gout",
!GOUT & SEX == "Male" ~ "Male Control",
!GOUT & SEX == "Female" ~ "Female Control"),
levels = c("Male Gout", "Female Gout", "Male Control", "Female Control")),
# COHORT2: study-level cohort label. The first matching rule wins, so UK Biobank rows are labelled before the ancestry-based rules are tried.
# The Ardea rules do not test GOUT. Rows matching no rule are NA and are removed by the filter() below.
COHORT2 = factor(case_when(GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Gout",
!GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Control",
GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Gout",
!GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Control",
GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Gout",
!GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Control",
Pheno.Study == "Ardea: 401" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LASSO",
Pheno.Study == "Ardea: CLEAR1" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR1",
Pheno.Study == "Ardea: CLEAR2" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR2",
Pheno.Study == "Ardea: CRYSTAL" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CRYSTAL",
Pheno.Study == "Ardea: LIGHT" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LIGHT",
GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian - Gout",
!GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian - Control",
GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian - Gout - NP",
!GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian - Control - NP",
GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Gout",
!GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Control"),
levels = c("UK Biobank - Gout", "UK Biobank - Control", "Aus/NZ - Gout", "Aus/NZ - Control", "GlobalGout - Gout", "GlobalGout - Control", "Ardea - LASSO", "Ardea - CLEAR1", "Ardea - CLEAR2", "Ardea - CRYSTAL", "Ardea - LIGHT", "East Polynesian - Gout", "East Polynesian - Control", "East Polynesian - Gout - NP", "East Polynesian - Control - NP", "West Polynesian - Gout", "West Polynesian - Control")),
# Flare counts are rounded to whole numbers last, after FLARE_CAT has already been derived
NUMATK = round(NUMATK)) %>%
# Keep rows with an age at collection and a PRS. Outside UK Biobank, a gout case must have at least one of AGE1ATK / NUMATK / TOPHIGOUT recorded.
# Rows that matched no COHORT2 rule are dropped.
filter(!is.na(AGECOL),
!is.na(PRS),
(Pheno.Study == "UK Biobank" | !GOUT | GOUT & !(is.na(AGE1ATK) & is.na(NUMATK) & is.na(TOPHIGOUT))),
!is.na(COHORT2))
# Sex-specific subsets of the analysis table
all_pheno_prs_male <- filter(all_pheno_prs, SEX == "Male")
all_pheno_prs_female <- filter(all_pheno_prs, SEX == "Female")
# Named list of the 34 cohort x (gout/control) x sex subsets used downstream.
# Names follow "<cohort> - <Gout|Control>[ - NP] - <Male|Female>" (Ardea trials
# have no Gout/Control part and are not filtered on GOUT). Within each cohort
# the order is Gout Male, Gout Female, Control Male, Control Female.
# Built inside local() so the helpers and constants do not leak into the
# global environment.
data_list <- local({
  european <- c("European", "European; Iberian", "Iberian")
  east_polynesian <- c("East Polynesian")
  west_polynesian <- c("West Polynesian", "Niuean", "Pukapukan")
  aus_nz_studies <- c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")

  # Apply the same row conditions to the male and the female table and name
  # the two results "<label> - Male" and "<label> - Female".
  split_by_sex <- function(label, ...) {
    setNames(
      list(
        filter(all_pheno_prs_male, ...),
        filter(all_pheno_prs_female, ...)
      ),
      paste(label, c("Male", "Female"), sep = " - ")
    )
  }

  # Gout subsets followed by control subsets for one cohort definition;
  # `suffix` (e.g. "NP") is placed after the Gout/Control part of the name.
  case_control <- function(cohort, ..., suffix = NULL) {
    c(
      split_by_sex(paste(c(cohort, "Gout", suffix), collapse = " - "), GOUT, ...),
      split_by_sex(paste(c(cohort, "Control", suffix), collapse = " - "), !GOUT, ...)
    )
  }

  # One Ardea trial, restricted to European-ancestry rows of that study
  ardea_trial <- function(trial, study) {
    split_by_sex(
      paste("Ardea", trial, sep = " - "),
      Pheno.Study == study,
      Geno.SpecificAncestry %in% european
    )
  }

  c(
    case_control("UK Biobank", Pheno.Study == "UK Biobank"),
    case_control(
      "Aus/NZ European",
      Geno.SpecificAncestry %in% european,
      Pheno.Study %in% aus_nz_studies
    ),
    case_control(
      "GlobalGout",
      Pheno.Study == "EuroGout",
      Geno.SpecificAncestry %in% european
    ),
    ardea_trial("LASSO", "Ardea: 401"),
    ardea_trial("CLEAR1", "Ardea: CLEAR1"),
    ardea_trial("CLEAR2", "Ardea: CLEAR2"),
    ardea_trial("CRYSTAL", "Ardea: CRYSTAL"),
    ardea_trial("LIGHT", "Ardea: LIGHT"),
    case_control(
      "East Polynesian",
      Geno.SpecificAncestry %in% east_polynesian,
      Pheno.Study != "Ngati Porou"
    ),
    case_control(
      "East Polynesian",
      Geno.SpecificAncestry %in% east_polynesian,
      Pheno.Study == "Ngati Porou",
      suffix = "NP"
    ),
    case_control("West Polynesian", Geno.SpecificAncestry %in% west_polynesian)
  )
})
# Restore saved R objects from the project's Output/ directory (paths resolved with here()).
# NOTE(review): presumably these provide UKBB_Gene_OR and Tin_Gene_OR — the object names stored in the files are not visible here; confirm.
load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Tin_Gene_OR.RData"))
# Odds ratio for one term of a fitted model: the exponentiated coefficient of
# `Predictor` in `x`, returned as a string with four decimal places.
OR <- function(x, Predictor) {
  log_odds <- coef(x)[[Predictor]]
  sprintf(fmt = "%#.4f", exp(log_odds))
}
# Lower confidence limit of the odds ratio for `Predictor`: the first column
# of confint.default(x) (Wald interval at its default level), exponentiated
# and returned as a string with four decimal places.
LCL_OR <- function(x, Predictor) {
  wald_ci <- confint.default(x)
  sprintf(fmt = "%#.4f", exp(wald_ci[Predictor, 1]))
}
# Upper confidence limit of the odds ratio for `Predictor`: the second column
# of confint.default(x) (Wald interval at its default level), exponentiated
# and returned as a string with four decimal places.
UCL_OR <- function(x, Predictor) {
  wald_ci <- confint.default(x)
  sprintf(fmt = "%#.4f", exp(wald_ci[Predictor, 2]))
}
# P-value for `Predictor`: the fourth column of the coefficient table in
# summary(x), rounded to three significant figures (returned as a number).
Pval <- function(x, Predictor) {
  coef_table <- summary(x)$coefficients
  signif(coef_table[Predictor, 4], digits = 3)
}
# Coefficient (beta) of `Predictor` in fitted model `x`, returned as a string
# with four decimal places.
Beta <- function(x, Predictor) {
  estimate <- coef(x)[[Predictor]]
  sprintf(fmt = "%#.4f", estimate)
}
# Lower confidence limit of the coefficient of `Predictor`: the first column
# of confint.default(x) (Wald interval at its default level), returned as a
# string with four decimal places.
LCL <- function(x, Predictor) {
  wald_ci <- confint.default(x)
  sprintf(fmt = "%#.4f", wald_ci[Predictor, 1])
}
# Upper confidence limit of the coefficient of `Predictor`: the second column
# of confint.default(x) (Wald interval at its default level), returned as a
# string with four decimal places.
UCL <- function(x, Predictor) {
  wald_ci <- confint.default(x)
  sprintf(fmt = "%#.4f", wald_ci[Predictor, 2])
}
First, I want to test the assumptions of the models of interest. For linear regression, the assumptions are:
- Linear relationship between variables
- Normality of residuals
- Homoscedasticity of residuals
- No multicollinearity (for multiple regression)
Of these four assumptions, logistic regression only relies on the absence of multicollinearity.
All assumptions should be tested for all models, but I will just test some representative models instead - the main thing is that extremely non-normal variables should not be used for linear regression. This should be tested for the age at onset variable, and the number of flares per year variable.
# Age at onset -------------------------------------------------------------------------------------
# Representative linear model of age at onset vs the PRS in the combined male gout cohort
# (male gout cases of European / Iberian ancestry). `tmp`, `mod` and `test` are
# scratch objects that the later assumption checks reuse.
tmp <- all_pheno_prs %>%
  filter(
    SEX == "Male",
    GOUT,
    Geno.SpecificAncestry %in% c("European", "Iberian", "European; Iberian")
  )

# Linearity: scatter with a smoother per cohort
ggplot(tmp, aes(x = PRS, y = AGE1ATK, color = COHORT2)) +
  geom_point(shape = 1) +
  geom_smooth(se = FALSE) # No evidence of non-linearity

# PRS adjusted for the first 10 genetic principal components
mod <- lm(
  AGE1ATK ~ PRS + Geno.PCVector1 + Geno.PCVector2 + Geno.PCVector3 + Geno.PCVector4 + Geno.PCVector5 +
    Geno.PCVector6 + Geno.PCVector7 + Geno.PCVector8 + Geno.PCVector9 + Geno.PCVector10,
  data = tmp
)
test <- augment(mod)

# Homoscedasticity: residuals vs fitted values
ggplot(test, aes(x = .fitted, y = .resid)) +
  geom_point() # no obvious pattern, so homoscedasticity seems to be met

# Normality of residuals: density histogram with the matching normal curve
# overlaid (after_stat() replaces the deprecated ..density.. notation),
# followed by a Q-Q plot, a boxplot and summary statistics
ggplot(data = test, mapping = aes(x = .resid)) +
  geom_histogram(mapping = aes(y = after_stat(density)), bins = 30, fill = "gray", color = "black") +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(test$.resid), sd = sd(test$.resid)),
    color = "red"
  )
ggplot(data = test, mapping = aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line()
ggplot(data = test, mapping = aes(x = .resid)) +
  geom_boxplot() +
  theme(
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )
test %>%
  pull(.resid) %>%
  stat.desc(basic = FALSE, desc = FALSE, norm = TRUE) %>%
  enframe() %>%
  pivot_wider(names_from = name, values_from = value)
# Normality of residuals is completely met here (ignore 2SE and shapiro-wilk test as this is large data)

vif(mod) # no multicollinearity (no values over 5 or 10)
# So we should have no problem running models of the PRS vs age at onset (as long as we don't include variables that exhibit multicollinearity)
# Flares (linear) ----------------------------------------------------------------------------------------
# First let's run a representative linear model of NUMATK vs PRS in the combined male gout cohort
# `tmp` is the subset built for the age-at-onset check; here flare counts above
# 52 are set to 52 and only rows with at least two flares are kept.
tmp <- tmp %>%
  mutate(NUMATK = case_when(
    NUMATK > 52 ~ 52,
    TRUE ~ NUMATK
  )) %>%
  filter(NUMATK >= 2)

# Linearity: scatter with a smoother per cohort
ggplot(tmp, aes(x = PRS, y = NUMATK, color = COHORT2)) +
  geom_point(shape = 1) +
  geom_smooth(se = FALSE)

# PRS adjusted for the first 10 genetic principal components
mod <- lm(
  NUMATK ~ PRS + Geno.PCVector1 + Geno.PCVector2 + Geno.PCVector3 + Geno.PCVector4 + Geno.PCVector5 +
    Geno.PCVector6 + Geno.PCVector7 + Geno.PCVector8 + Geno.PCVector9 + Geno.PCVector10,
  data = tmp
)
test <- augment(mod)

# Homoscedasticity: residuals vs fitted values
ggplot(test, aes(x = .fitted, y = .resid)) +
  geom_point() # weird pattern, so homoscedasticity seems to not be met

# Normality of residuals: density histogram with the matching normal curve
# overlaid (after_stat() replaces the deprecated ..density.. notation),
# followed by a Q-Q plot, a boxplot and summary statistics
ggplot(data = test, mapping = aes(x = .resid)) +
  geom_histogram(mapping = aes(y = after_stat(density)), bins = 30, fill = "gray", color = "black") +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(test$.resid), sd = sd(test$.resid)),
    color = "red"
  )
ggplot(data = test, mapping = aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line()
ggplot(data = test, mapping = aes(x = .resid)) +
  geom_boxplot() +
  theme(
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )
test %>%
  pull(.resid) %>%
  stat.desc(basic = FALSE, desc = FALSE, norm = TRUE) %>%
  enframe() %>%
  pivot_wider(names_from = name, values_from = value)
# Normality of residuals is not met here (ignore 2SE and shapiro-wilk test as this is large data)

vif(mod) # no multicollinearity (no values over 5 or 10)
# So it is not appropriate to use the NUMATK variable in a linear regression model
# Flares (ordinal) -----------------------------------------------------------------
# I will go ahead and test whether categorizing the variable (FLARE_CAT) and running an independent ANOVA or ordinal logistic regression model will be appropriate - note I am not sure how easy it will be to adjust for covariates in an ANOVA (perhaps I could run an ANCOVA but it might not be doing what I think it's doing)
# The ordinal logistic regression test should tell us the odds of being in any combination of higher flare categories vs the remainder of the categories (i.e. highest flare group vs all others, or highest 5 flare groups vs lowest flare group)
# I'll try running an example from a paper to see if I can replicate their results in R
# dat <- tibble("Group" = rep(c("Activator", "Headgear"), each = 50),
# "Happiness" = factor(c(rep("Unhappy", 30), rep("Somewhat happy", 14), rep("Very happy", 6), rep("Unhappy", 14), rep("Somewhat happy", 25), rep("Very happy", 11)), levels = c("Unhappy", "Somewhat happy", "Very happy"), ordered = T))
#
# mod <- MASS::polr(Happiness ~ Group, data = dat, Hess = TRUE)
#
# modsum <- tibble("Group of interest" = row.names(summary(mod)$coefficients)[1],
# "OR" = exp(summary(mod)$coefficients)[1],
# "Lower CI" = exp(confint.default(mod))[1],
# "Upper CI" = exp(confint.default(mod))[2],
# "P-value" = pnorm(abs(summary(mod)$coefficients[1, "t value"]), lower.tail = FALSE) * 2)
# To interpret the odds ratio, we can say that for the group of children that wore Headgear, the odds of being more happy (i.e. being in a happiness tier or higher compared to a lower tier) were 3.27 times those of the children that used an Activator
# The parallel to this in our example would be that for those individuals with 1 unit more PRS than another group, the odds of being in a higher flare category (i.e. >= 3 flares vs < 3 flares) were X times those of the other group (i.e. those with 1 unit PRS lower - the reference group). Let's test this using an example model

# Proportional-odds model of flare category on the PRS, fitted to the current
# `tmp` subset; Hess = TRUE keeps the Hessian so summary() can report standard
# errors.
mod <- polr(FLARE_CAT ~ PRS, data = tmp, Hess = TRUE)

# One-row summary for the first coefficient: odds ratio, Wald confidence
# limits and a two-sided p-value from the normal approximation to the t value
modsum <- tibble(
  "Group of interest" = row.names(summary(mod)$coefficients)[1],
  "OR" = exp(summary(mod)$coefficients)[1],
  "Lower CI" = exp(confint.default(mod))[1],
  "Upper CI" = exp(confint.default(mod))[2],
  "P-value" = pnorm(abs(summary(mod)$coefficients[1, "t value"]), lower.tail = FALSE) * 2
)
# This suggests that the model is significant for the full male gout cohort, with an OR of 1.14 [1.03, 1.26], p = 0.0112 per PRS unit

# Can try to visualize this by looking at proportions of FLARE_CAT within bins of PRS
tmp %>%
  filter(!is.na(PRS)) %>%
  mutate(
    # Six equal-sized PRS bins
    PRS_bin = factor(ntile(PRS, 6), ordered = TRUE),
    # Missing categories become "No Data"; levels run from "No Data" down to "0"
    FLARE_CAT = factor(
      coalesce(as.character(FLARE_CAT), "No Data"),
      levels = c("No Data", "12 - 52", "6 - 11", as.character(5:0)),
      ordered = TRUE
    )
  ) %>%
  group_by(PRS_bin, FLARE_CAT, SEX, GROUP) %>%
  summarise(value = n(), .groups = "drop_last") %>%
  ggplot(aes(x = PRS_bin, y = value, fill = FLARE_CAT)) +
  # position = "fill" rescales each bar to proportions
  geom_col(position = "fill") +
  facet_wrap(~ SEX * GROUP) +
  scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
  theme(
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )
# Flares (ANOVA) --------------------------------------------------------------------------------------
# The ANOVA test will essentially ask "does the mean PRS differ between flare categories"
# I should further note that I am unclear on how to meta-analyze ANOVA results - perhaps just running them within each cohort and determining how many show significant differences and, of those, which groups were different and in which direction? The other option is to pool cohorts but that runs into potentially weird issues of bias. So I think in the end I should run both models and see if I get a different answer, then decide how to present them in the paper.
# The assumptions of a one-way independent ANOVA are:
# 1. Independence of the datapoints (met)
# 2. Normality of model residuals within groups
# 3. Homogeneity of variance of model residuals within groups

# Between-subjects ANOVA of PRS across flare categories (type 3 sums of
# squares); IID identifies the individual and must be a factor for ezANOVA()
mod <- tmp %>%
  mutate(
    IID = factor(IID),
    FLARE_CAT = factor(FLARE_CAT)
  ) %>%
  ezANOVA(
    data = .,
    dv = PRS,
    wid = IID,
    between = FLARE_CAT,
    type = 3,
    return_aov = TRUE
  )
mod

# PRS distribution within each flare category
ggplot(tmp, aes(x = FLARE_CAT, y = PRS)) +
  geom_boxplot()

# Remove the scratch objects used by the assumption checks
rm(tmp, mod, modsum, test)
Based on the above, it should be appropriate to run linear regression models on age at onset, but not on flare frequency. For flare frequency, I could use ordinal logistic regression after categorizing flares into roughly even groups (this regression assumes that the effect of a predictor is the same across every pair of adjacent levels of the outcome, i.e. proportional odds). I could also use ANOVA, but it may prove difficult to adjust for covariates. For now I won’t run any models for this variable.
# Gout case-control models -----------------------------------------------------
# Each genetic predictor set (gout PRS, gout PRS without ABCG2, both ABCG2 SNPs
# jointly, urate PRS, urate PRS without ABCG2) is modelled against gout status
# with logistic regression, separately in males and females, both unadjusted
# and adjusted for age at collection (AGECOL). Global PCs are included for every
# cohort except UK Biobank, and the 10 Oceanian PCs are added for Polynesian
# cohorts. One result row is produced per predictor per model and collected in
# the GoutModels data frame.

# Stack cases and controls for each cohort/sex. The two inputs are disjoint
# subsets of the same table, so bind_rows() gives the same rows as the previous
# full_join() on every column, without the cost of an all-column join.
gout_data_list <- list(
  "UK Biobank - Male" = bind_rows(data_list[["UK Biobank - Gout - Male"]],
                                  data_list[["UK Biobank - Control - Male"]]),
  "UK Biobank - Female" = bind_rows(data_list[["UK Biobank - Gout - Female"]],
                                    data_list[["UK Biobank - Control - Female"]]),
  "Aus/NZ European - Male" = bind_rows(data_list[["Aus/NZ European - Gout - Male"]],
                                       data_list[["Aus/NZ European - Control - Male"]]),
  "Aus/NZ European - Female" = bind_rows(data_list[["Aus/NZ European - Gout - Female"]],
                                         data_list[["Aus/NZ European - Control - Female"]]),
  "East Polynesian - Male" = bind_rows(data_list[["East Polynesian - Gout - Male"]],
                                       data_list[["East Polynesian - Control - Male"]]),
  "East Polynesian - Female" = bind_rows(data_list[["East Polynesian - Gout - Female"]],
                                         data_list[["East Polynesian - Control - Female"]]),
  "East Polynesian - NP - Male" = bind_rows(data_list[["East Polynesian - Gout - NP - Male"]],
                                            data_list[["East Polynesian - Control - NP - Male"]]),
  "East Polynesian - NP - Female" = bind_rows(data_list[["East Polynesian - Gout - NP - Female"]],
                                              data_list[["East Polynesian - Control - NP - Female"]]),
  "West Polynesian - Male" = bind_rows(data_list[["West Polynesian - Gout - Male"]],
                                       data_list[["West Polynesian - Control - Male"]]),
  "West Polynesian - Female" = bind_rows(data_list[["West Polynesian - Gout - Female"]],
                                         data_list[["West Polynesian - Control - Female"]])
)

# Drop cohorts with fewer than 20 participants
gout_data_list <- gout_data_list[vapply(gout_data_list, nrow, integer(1)) >= 20L]

# Predictor sets: a single score per model, except the two ABCG2 variants,
# which are fitted together in one model and reported as two rows
prsnames <- list("PRS", "PRS_noABCG2", c("rs2231142", "rs10011796"), "Urate_PRS", "Urate_PRS_noABCG2")
global_pcs <- paste0("Geno.PCVector", 1:10)
oceanian_pcs <- paste0("Geno.PCVector", 1:10, "_Oc")

modlist <- vector("list", length(gout_data_list))
for (i in seq_along(gout_data_list)) {
  cohort <- names(gout_data_list)[i]
  covariates <- character(0)
  if (!str_detect(cohort, "UK Biobank")) {
    covariates <- c(covariates, global_pcs)
  }
  if (str_detect(cohort, "Polynesian")) {
    covariates <- c(covariates, oceanian_pcs)
  }
  covariates2 <- c(covariates, "AGECOL")

  # One row per predictor per model; all unadjusted rows come first, followed
  # by all AGECOL-adjusted rows
  tmplist <- vector("list", 2 * sum(lengths(prsnames)))
  n_rows <- 0
  for (covset in list(covariates, covariates2)) {
    for (j in seq_along(prsnames)) {
      f <- as.formula(paste("GOUT", paste(c(prsnames[[j]], covset), collapse = " + "), sep = " ~ "))
      fit <- glm(f, family = binomial, data = gout_data_list[[i]])
      fit_coefs <- summary(fit)$coefficients
      for (predictor in prsnames[[j]]) {
        n_rows <- n_rows + 1
        # Columns: Cohort, N, N case, N control, Outcome, Predictors, Predictor,
        # Covariates, OR, LCL, UCL, Pval, log-odds, standard error.
        # N and the case/control counts come from the fitted model, because
        # glm() by default drops rows with a missing predictor or covariate;
        # nrow() of the input would overstate the analysed sample in that case.
        # Everything is coerced to character here and converted back below.
        tmplist[[n_rows]] <- c(cohort,
                               nobs(fit),
                               sum(fit$y == 1),
                               sum(fit$y == 0),
                               "Gout",
                               paste(prsnames[[j]], collapse = " + "),
                               predictor,
                               paste(covset, collapse = " + "),
                               OR(fit, predictor),
                               LCL_OR(fit, predictor),
                               UCL_OR(fit, predictor),
                               Pval(fit, predictor),
                               coef(fit)[[predictor]],
                               fit_coefs[predictor, 2])
      }
    }
  }
  modlist[[i]] <- tmplist
}

# Models are no longer stored as Model_* globals; this only clears any that a
# previous chunk left behind, so the environment ends in the same state as before
rm(list = ls(pattern = "Model_"))

# Bind the per-predictor character vectors into one row each
tmp <- as.data.frame(unname(do.call(rbind, unlist(modlist, recursive = FALSE))),
                     stringsAsFactors = FALSE)
colnames(tmp) <- c("Cohort", "N", "N case", "N control", "Outcome", "Predictors", "Predictor", "Covariates", "OR", "LCL", "UCL", "Pval", "log-odds", "SE")
GoutModels <- tmp %>%
  mutate(across(c(Cohort, Outcome, Predictors, Predictor, Covariates), factor),
         across(c(N, `N case`, `N control`, OR, LCL, UCL, Pval, `log-odds`, SE), as.numeric))
#save(GoutModels, file = here("Output/GoutModels.RData"))
rm(modlist, tmp, tmplist, n_rows, cohort, covariates, covariates2, covset, f, fit, fit_coefs, i, j, predictor, prsnames, global_pcs, oceanian_pcs)
# We need to model the PRS, the PRS without ABCG2, and both ABCG2 variants simultaneously against age at onset and tophi, adjusting for global PCs. It needs to be in males and females separately, and needs to be both adjusted for disease duration (for tophi) and unadjusted. It also needs to run with the 10 Oceanian PCs for Polynesian cohorts.
# Onset --------------------------------------------------------
# Linear regression of age at first attack (AGE1ATK) on each predictor set, one
# model per cohort/sex. Results are collected in OnsetModels and saved to
# Output/OnsetModels.RData.

# Output cohort name -> data_list entry holding that cohort's gout cases
onset_sources <- c(
  "Aus/NZ European - Male" = "Aus/NZ European - Gout - Male",
  "Aus/NZ European - Female" = "Aus/NZ European - Gout - Female",
  "GlobalGout - Male" = "GlobalGout - Gout - Male",
  "GlobalGout - Female" = "GlobalGout - Gout - Female",
  "Ardea - LASSO - Male" = "Ardea - LASSO - Male",
  "Ardea - LASSO - Female" = "Ardea - LASSO - Female",
  "Ardea - CLEAR1 - Male" = "Ardea - CLEAR1 - Male",
  "Ardea - CLEAR1 - Female" = "Ardea - CLEAR1 - Female",
  "Ardea - CLEAR2 - Male" = "Ardea - CLEAR2 - Male",
  "Ardea - CLEAR2 - Female" = "Ardea - CLEAR2 - Female",
  "Ardea - CRYSTAL - Male" = "Ardea - CRYSTAL - Male",
  "Ardea - CRYSTAL - Female" = "Ardea - CRYSTAL - Female",
  "Ardea - LIGHT - Male" = "Ardea - LIGHT - Male",
  "Ardea - LIGHT - Female" = "Ardea - LIGHT - Female",
  "East Polynesian - Male" = "East Polynesian - Gout - Male",
  "East Polynesian - Female" = "East Polynesian - Gout - Female",
  "East Polynesian - NP - Male" = "East Polynesian - Gout - NP - Male",
  "East Polynesian - NP - Female" = "East Polynesian - Gout - NP - Female",
  "West Polynesian - Male" = "West Polynesian - Gout - Male",
  "West Polynesian - Female" = "West Polynesian - Gout - Female"
)
# Keep only participants with a recorded age at onset
onset_data_list <- lapply(onset_sources, function(src) {
  filter(data_list[[src]], !is.na(AGE1ATK))
})
# Drop cohorts with fewer than 20 participants
onset_data_list <- onset_data_list[vapply(onset_data_list, nrow, integer(1)) >= 20L]

# Predictor sets: a single score per model, except the two ABCG2 variants,
# which are fitted together in one model and reported as two rows
prsnames <- list("PRS", "PRS_noABCG2", c("rs2231142", "rs10011796"), "Urate_PRS", "Urate_PRS_noABCG2")
global_pcs <- paste0("Geno.PCVector", 1:10)
oceanian_pcs <- paste0("Geno.PCVector", 1:10, "_Oc")

modlist <- vector("list", length(onset_data_list))
for (i in seq_along(onset_data_list)) {
  cohort <- names(onset_data_list)[i]
  covariates <- character(0)
  if (!str_detect(cohort, "UK Biobank")) {
    covariates <- c(covariates, global_pcs)
  }
  if (str_detect(cohort, "Polynesian")) {
    covariates <- c(covariates, oceanian_pcs)
  }

  tmplist <- vector("list", sum(lengths(prsnames)))
  n_rows <- 0
  for (j in seq_along(prsnames)) {
    f <- as.formula(paste("AGE1ATK", paste(c(prsnames[[j]], covariates), collapse = " + "), sep = " ~ "))
    fit <- lm(f, data = onset_data_list[[i]])
    fit_coefs <- summary(fit)$coefficients
    for (predictor in prsnames[[j]]) {
      n_rows <- n_rows + 1
      # Columns: Cohort, N, Outcome, Predictors, Predictor, Covariates, Beta,
      # LCL, UCL, Pval, standard error.
      # N is taken from the fitted model, because lm() by default drops rows
      # with a missing predictor or covariate.
      # Everything is coerced to character here and converted back below.
      tmplist[[n_rows]] <- c(cohort,
                             nobs(fit),
                             "Age at Onset (years)",
                             paste(prsnames[[j]], collapse = " + "),
                             predictor,
                             paste(covariates, collapse = " + "),
                             Beta(fit, predictor),
                             LCL(fit, predictor),
                             UCL(fit, predictor),
                             Pval(fit, predictor),
                             fit_coefs[predictor, 2])
    }
  }
  modlist[[i]] <- tmplist
}

# Models are no longer stored as Model_* globals; this only clears any that a
# previous chunk left behind, so the environment ends in the same state as before
rm(list = ls(pattern = "Model_"))

# Bind the per-predictor character vectors into one row each
tmp <- as.data.frame(unname(do.call(rbind, unlist(modlist, recursive = FALSE))),
                     stringsAsFactors = FALSE)
colnames(tmp) <- c("Cohort", "N", "Outcome", "Predictors", "Predictor", "Covariates", "Beta", "LCL", "UCL", "Pval", "SE")
OnsetModels <- tmp %>%
  mutate(across(c(Cohort, Outcome, Predictors, Predictor, Covariates), factor),
         across(c(N, Beta, LCL, UCL, Pval, SE), as.numeric))
save(OnsetModels, file = here("Output/OnsetModels.RData"))
rm(modlist, tmp, tmplist, n_rows, cohort, covariates, f, fit, fit_coefs, i, j, predictor, prsnames, global_pcs, oceanian_pcs, onset_sources)
# Tophi -------------------------------------------------------------------------------
# Logistic regression of tophi presence (TOPHIGOUT) on each predictor set, one
# model per cohort/sex, both unadjusted and adjusted for disease duration
# (DURATION). Results are collected in TophiModels and saved to
# Output/TophiModels.RData.
# The two East Polynesian gout groups (with and without the NP suffix) are
# combined here.
# NOTE(review): Ardea - CRYSTAL is not included in this list although it is
# used for the age-at-onset models -- confirm that this is intentional.
tophi_data_list <- list(
  "Aus/NZ European - Male" = data_list[["Aus/NZ European - Gout - Male"]],
  "Aus/NZ European - Female" = data_list[["Aus/NZ European - Gout - Female"]],
  "GlobalGout - Male" = data_list[["GlobalGout - Gout - Male"]],
  "GlobalGout - Female" = data_list[["GlobalGout - Gout - Female"]],
  "Ardea - LASSO - Male" = data_list[["Ardea - LASSO - Male"]],
  "Ardea - LASSO - Female" = data_list[["Ardea - LASSO - Female"]],
  "Ardea - CLEAR1 - Male" = data_list[["Ardea - CLEAR1 - Male"]],
  "Ardea - CLEAR1 - Female" = data_list[["Ardea - CLEAR1 - Female"]],
  "Ardea - CLEAR2 - Male" = data_list[["Ardea - CLEAR2 - Male"]],
  "Ardea - CLEAR2 - Female" = data_list[["Ardea - CLEAR2 - Female"]],
  "Ardea - LIGHT - Male" = data_list[["Ardea - LIGHT - Male"]],
  "Ardea - LIGHT - Female" = data_list[["Ardea - LIGHT - Female"]],
  "East Polynesian - Male" = rbind(data_list[["East Polynesian - Gout - Male"]],
                                   data_list[["East Polynesian - Gout - NP - Male"]]),
  "East Polynesian - Female" = rbind(data_list[["East Polynesian - Gout - Female"]],
                                     data_list[["East Polynesian - Gout - NP - Female"]]),
  "West Polynesian - Male" = data_list[["West Polynesian - Gout - Male"]],
  "West Polynesian - Female" = data_list[["West Polynesian - Gout - Female"]]
)
# Keep only participants with recorded tophi status
tophi_data_list <- lapply(tophi_data_list, function(d) filter(d, !is.na(TOPHIGOUT)))
# Drop cohorts with fewer than 20 participants
tophi_data_list <- tophi_data_list[vapply(tophi_data_list, nrow, integer(1)) >= 20L]

# Predictor sets: a single score per model, except the two ABCG2 variants,
# which are fitted together in one model and reported as two rows
prsnames <- list("PRS", "PRS_noABCG2", c("rs2231142", "rs10011796"), "Urate_PRS", "Urate_PRS_noABCG2")
global_pcs <- paste0("Geno.PCVector", 1:10)
oceanian_pcs <- paste0("Geno.PCVector", 1:10, "_Oc")

modlist <- vector("list", length(tophi_data_list))
for (i in seq_along(tophi_data_list)) {
  cohort <- names(tophi_data_list)[i]
  covariates <- character(0)
  if (!str_detect(cohort, "UK Biobank")) {
    covariates <- c(covariates, global_pcs)
  }
  if (str_detect(cohort, "Polynesian")) {
    covariates <- c(covariates, oceanian_pcs)
  }
  covariates2 <- c(covariates, "DURATION")

  # One row per predictor per model; all unadjusted rows come first, followed
  # by all DURATION-adjusted rows
  tmplist <- vector("list", 2 * sum(lengths(prsnames)))
  n_rows <- 0
  for (covset in list(covariates, covariates2)) {
    for (j in seq_along(prsnames)) {
      f <- as.formula(paste("TOPHIGOUT", paste(c(prsnames[[j]], covset), collapse = " + "), sep = " ~ "))
      fit <- glm(f, family = binomial, data = tophi_data_list[[i]])
      fit_coefs <- summary(fit)$coefficients
      for (predictor in prsnames[[j]]) {
        n_rows <- n_rows + 1
        # Columns: Cohort, N, N case, N control, Outcome, Predictors, Predictor,
        # Covariates, OR, LCL, UCL, Pval, log-odds, standard error.
        # N and the case/control counts come from the fitted model: glm() by
        # default drops rows with a missing predictor or covariate (for
        # example a missing DURATION in the adjusted model), so nrow() of the
        # input would overstate the analysed sample.
        # Everything is coerced to character here and converted back below.
        tmplist[[n_rows]] <- c(cohort,
                               nobs(fit),
                               sum(fit$y == 1),
                               sum(fit$y == 0),
                               "Tophi",
                               paste(prsnames[[j]], collapse = " + "),
                               predictor,
                               paste(covset, collapse = " + "),
                               OR(fit, predictor),
                               LCL_OR(fit, predictor),
                               UCL_OR(fit, predictor),
                               Pval(fit, predictor),
                               coef(fit)[[predictor]],
                               fit_coefs[predictor, 2])
      }
    }
  }
  modlist[[i]] <- tmplist
}

# Models are no longer stored as Model_* globals; this only clears any that a
# previous chunk left behind, so the environment ends in the same state as before
rm(list = ls(pattern = "Model_"))

# Bind the per-predictor character vectors into one row each
tmp <- as.data.frame(unname(do.call(rbind, unlist(modlist, recursive = FALSE))),
                     stringsAsFactors = FALSE)
colnames(tmp) <- c("Cohort", "N", "N case", "N control", "Outcome", "Predictors", "Predictor", "Covariates", "OR", "LCL", "UCL", "Pval", "log-odds", "SE")
TophiModels <- tmp %>%
  mutate(across(c(Cohort, Outcome, Predictors, Predictor, Covariates), factor),
         across(c(N, `N case`, `N control`, OR, LCL, UCL, Pval, `log-odds`, SE), as.numeric))
save(TophiModels, file = here("Output/TophiModels.RData"))
rm(modlist, tmp, tmplist, n_rows, cohort, covariates, covariates2, covset, f, fit, fit_coefs, i, j, predictor, prsnames, global_pcs, oceanian_pcs)
Making Plots and Tables for the Final Manuscript
The following code details the creation of the final tables and plots that were included in the manuscript (including supplementary material).
These include the following figures:
Figure 1 = Forest plot(s) of genetic effect on age at onset, A = full PRS model, B = PRS less ABCG2
Figure 2 = Effect of individual ABCG2 variants, A = rs2231142, B = rs10011796
Figure 3 = Forest plot of PRS effect on presence of tophi (two panels, before and after adjustment for disease duration)
Supp Figure 1 = Manhattan plot
Supp Figure 2 = Tophi ABCG2 models and PRS without ABCG2
Supp Figure 3 = Relationship between flares and PRS (plot as faceted scatter plot, and/or as grouped category plots)
And the following tables:
Table 1 = Gout cohort statistics/demographics (only key variables)
Supp Table 1 = Table describing GWAS results and which SNPs are part of the PRS
Supp Table 2 = Missing data
Supp Table 3 = Full cohort stats
Supp Table 4 = All model results in table format
# Datasets
# Load the combined phenotype/PRS table, blank gout-specific phenotypes for
# controls, derive the grouping factors used by the tables and plots, and drop
# participants who cannot be analysed.
load(here("Output/all_pheno_prs.RData"))
all_pheno_prs <- local({
  # Shared definitions, kept local so they do not leak into the global environment
  european_ancestries <- c("European", "European; Iberian", "Iberian")
  west_polynesian_ancestries <- c("West Polynesian", "Niuean", "Pukapukan")
  aus_nz_studies <- c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")

  all_pheno_prs %>%
    mutate(
      # Gout-specific phenotypes are set to NA for controls
      AGE1ATK = case_when(GOUT ~ AGE1ATK),
      DURATION = case_when(GOUT ~ DURATION),
      # Flare counts are rounded to whole numbers BEFORE they are categorised.
      # Previously FLARE_CAT was built from the raw NUMATK, so a non-integer
      # count produced a label outside the factor levels and silently became
      # NA, and controls could carry a flare category although their NUMATK
      # is blanked.
      NUMATK = round(case_when(GOUT ~ NUMATK)),
      TOPHIGOUT = case_when(GOUT ~ TOPHIGOUT),
      ULT = case_when(GOUT ~ ULT),
      # Flares in the last year: 0-5 individually, then 6-11 and 12-52 banded.
      # Counts outside 0-52 (and missing counts) are NA.
      FLARE_CAT = factor(
        case_when(
          between(NUMATK, 0, 5) ~ as.character(NUMATK),
          between(NUMATK, 6, 11) ~ "6 - 11",
          between(NUMATK, 12, 52) ~ "12 - 52"
        ),
        levels = c(paste0(0:5), "6 - 11", "12 - 52"),
        ordered = TRUE
      ),
      SEX = factor(SEX, levels = c("Male", "Female")),
      GOUT2 = factor(GOUT, levels = c(TRUE, FALSE), labels = c("Gout", "Control")),
      # Ancestry group x gout status; the Ngati Porou (NP) study is kept
      # separate within the East Polynesian ancestry group
      GROUP = factor(
        case_when(
          GOUT & Geno.SpecificAncestry %in% european_ancestries ~ "European Gout",
          !GOUT & Geno.SpecificAncestry %in% european_ancestries ~ "European Control",
          GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian Gout",
          !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian Control",
          GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian Gout - NP",
          !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian Control - NP",
          GOUT & Geno.SpecificAncestry %in% west_polynesian_ancestries ~ "West Polynesian Gout",
          !GOUT & Geno.SpecificAncestry %in% west_polynesian_ancestries ~ "West Polynesian Control"
        ),
        levels = c("European Gout", "European Control", "East Polynesian Gout", "East Polynesian Control", "East Polynesian Gout - NP", "East Polynesian Control - NP", "West Polynesian Gout", "West Polynesian Control")
      ),
      # GROUP x sex, e.g. "European Gout - male". A missing GROUP or SEX gives
      # a string that is not among the levels and therefore becomes NA.
      GROUP2 = factor(
        paste(GROUP, tolower(as.character(SEX)), sep = " - "),
        levels = paste(rep(levels(GROUP), each = 2), c("male", "female"), sep = " - ")
      ),
      # Sex x gout status, e.g. "Male Gout"; NA when either part is missing
      GROUP3 = factor(
        paste(SEX, GOUT2),
        levels = c("Male Gout", "Female Gout", "Male Control", "Female Control")
      ),
      # Analysis cohort. Order matters: case_when() uses the first matching
      # rule, so UK Biobank membership takes precedence over ancestry rules.
      COHORT2 = factor(
        case_when(
          GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Gout",
          !GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Control",
          GOUT & Geno.SpecificAncestry %in% european_ancestries & Pheno.Study %in% aus_nz_studies ~ "Aus/NZ - Gout",
          !GOUT & Geno.SpecificAncestry %in% european_ancestries & Pheno.Study %in% aus_nz_studies ~ "Aus/NZ - Control",
          GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% european_ancestries ~ "GlobalGout - Gout",
          !GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% european_ancestries ~ "GlobalGout - Control",
          Pheno.Study == "Ardea: 401" & Geno.SpecificAncestry %in% european_ancestries ~ "Ardea - LASSO",
          Pheno.Study == "Ardea: CLEAR1" & Geno.SpecificAncestry %in% european_ancestries ~ "Ardea - CLEAR1",
          Pheno.Study == "Ardea: CLEAR2" & Geno.SpecificAncestry %in% european_ancestries ~ "Ardea - CLEAR2",
          Pheno.Study == "Ardea: CRYSTAL" & Geno.SpecificAncestry %in% european_ancestries ~ "Ardea - CRYSTAL",
          Pheno.Study == "Ardea: LIGHT" & Geno.SpecificAncestry %in% european_ancestries ~ "Ardea - LIGHT",
          GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian - Gout",
          !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study != "Ngati Porou" ~ "East Polynesian - Control",
          GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian - Gout - NP",
          !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") & Pheno.Study == "Ngati Porou" ~ "East Polynesian - Control - NP",
          GOUT & Geno.SpecificAncestry %in% west_polynesian_ancestries ~ "West Polynesian - Gout",
          !GOUT & Geno.SpecificAncestry %in% west_polynesian_ancestries ~ "West Polynesian - Control"
        ),
        levels = c("UK Biobank - Gout", "UK Biobank - Control", "Aus/NZ - Gout", "Aus/NZ - Control", "GlobalGout - Gout", "GlobalGout - Control", "Ardea - LASSO", "Ardea - CLEAR1", "Ardea - CLEAR2", "Ardea - CRYSTAL", "Ardea - LIGHT", "East Polynesian - Gout", "East Polynesian - Control", "East Polynesian - Gout - NP", "East Polynesian - Control - NP", "West Polynesian - Gout", "West Polynesian - Control")
      )
    ) %>%
    # Keep participants with an age at collection, a PRS and an assigned cohort.
    # Outside UK Biobank, gout cases must also have at least one of age at
    # onset, flare count or tophi status recorded.
    filter(!is.na(AGECOL),
           !is.na(PRS),
           (Pheno.Study == "UK Biobank" | !GOUT | GOUT & !(is.na(AGE1ATK) & is.na(NUMATK) & is.na(TOPHIGOUT))),
           !is.na(COHORT2))
})
# Sex-specific copies of the analysis table
all_pheno_prs_male <- filter(all_pheno_prs, SEX == "Male")
all_pheno_prs_female <- filter(all_pheno_prs, SEX == "Female")

# Named list of cohort subsets, one entry per cohort and sex
# ("<cohort> - Male" followed by "<cohort> - Female" for each cohort in turn).
data_list <- local({
  european_ancestries <- c("European", "European; Iberian", "Iberian")
  west_polynesian_ancestries <- c("West Polynesian", "Niuean", "Pukapukan")
  aus_nz_studies <- c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")

  # Each cohort is defined once as a filter that is applied to both sex subsets
  cohort_filters <- list(
    "UK Biobank - Gout" = function(d) filter(d, GOUT, Pheno.Study == "UK Biobank"),
    "UK Biobank - Control" = function(d) filter(d, !GOUT, Pheno.Study == "UK Biobank"),
    "Aus/NZ European - Gout" = function(d) filter(d, GOUT, Geno.SpecificAncestry %in% european_ancestries, Pheno.Study %in% aus_nz_studies),
    "Aus/NZ European - Control" = function(d) filter(d, !GOUT, Geno.SpecificAncestry %in% european_ancestries, Pheno.Study %in% aus_nz_studies),
    "GlobalGout - Gout" = function(d) filter(d, GOUT, Pheno.Study == "EuroGout", Geno.SpecificAncestry %in% european_ancestries),
    "GlobalGout - Control" = function(d) filter(d, !GOUT, Pheno.Study == "EuroGout", Geno.SpecificAncestry %in% european_ancestries),
    # The Ardea trial cohorts are not split by gout status
    "Ardea - LASSO" = function(d) filter(d, Pheno.Study == "Ardea: 401", Geno.SpecificAncestry %in% european_ancestries),
    "Ardea - CLEAR1" = function(d) filter(d, Pheno.Study == "Ardea: CLEAR1", Geno.SpecificAncestry %in% european_ancestries),
    "Ardea - CLEAR2" = function(d) filter(d, Pheno.Study == "Ardea: CLEAR2", Geno.SpecificAncestry %in% european_ancestries),
    "Ardea - CRYSTAL" = function(d) filter(d, Pheno.Study == "Ardea: CRYSTAL", Geno.SpecificAncestry %in% european_ancestries),
    "Ardea - LIGHT" = function(d) filter(d, Pheno.Study == "Ardea: LIGHT", Geno.SpecificAncestry %in% european_ancestries),
    # East Polynesian participants are split by whether they come from the
    # Ngati Porou study (NP) or not
    "East Polynesian - Gout" = function(d) filter(d, GOUT, Geno.SpecificAncestry %in% c("East Polynesian"), Pheno.Study != "Ngati Porou"),
    "East Polynesian - Control" = function(d) filter(d, !GOUT, Geno.SpecificAncestry %in% c("East Polynesian"), Pheno.Study != "Ngati Porou"),
    "East Polynesian - Gout - NP" = function(d) filter(d, GOUT, Geno.SpecificAncestry %in% c("East Polynesian"), Pheno.Study == "Ngati Porou"),
    "East Polynesian - Control - NP" = function(d) filter(d, !GOUT, Geno.SpecificAncestry %in% c("East Polynesian"), Pheno.Study == "Ngati Porou"),
    "West Polynesian - Gout" = function(d) filter(d, GOUT, Geno.SpecificAncestry %in% west_polynesian_ancestries),
    "West Polynesian - Control" = function(d) filter(d, !GOUT, Geno.SpecificAncestry %in% west_polynesian_ancestries)
  )
  sex_subsets <- list(Male = all_pheno_prs_male, Female = all_pheno_prs_female)

  cohorts <- list()
  for (cohort_name in names(cohort_filters)) {
    for (sex_name in names(sex_subsets)) {
      cohorts[[paste(cohort_name, sex_name, sep = " - ")]] <-
        cohort_filters[[cohort_name]](sex_subsets[[sex_name]])
    }
  }
  cohorts
})
# Plain-text cohort labels, one per sex-stratified cohort subset (34 in total).
# Each cohort/status prefix is expanded to a " - Male" then " - Female" entry,
# so the order is prefix-major with Male always preceding Female.
cohortstring <- paste(
  rep(c("UK Biobank - Gout",
        "UK Biobank - Control",
        "Aus/NZ European - Gout",
        "Aus/NZ European - Control",
        "GlobalGout - Gout",
        "GlobalGout - Control",
        "Ardea - LASSO",
        "Ardea - CLEAR1",
        "Ardea - CLEAR2",
        "Ardea - CRYSTAL",
        "Ardea - LIGHT",
        "East Polynesian - Gout",
        "East Polynesian - Control",
        "East Polynesian - Gout - NP",
        "East Polynesian - Control - NP",
        "West Polynesian - Gout",
        "West Polynesian - Control"),
      each = 2),
  c("Male", "Female"),
  sep = " - "
)
# HTML column headers for the cohort table (same order as the cohort labels),
# with "<br/>" separating cohort, status and sex. The Ngati Porou subsets carry
# a trailing "<br/>NP" after the sex.
clean_names <- paste0(
  rep(c("UK Biobank<br/>Gout",
        "UK Biobank<br/>Control",
        "Aus/NZ European<br/>Gout",
        "Aus/NZ European<br/>Control",
        "GlobalGout<br/>Gout",
        "GlobalGout<br/>Control",
        "Ardea<br/>LASSO<br/>Gout",
        "Ardea<br/>CLEAR1<br/>Gout",
        "Ardea<br/>CLEAR2<br/>Gout",
        "Ardea<br/>CRYSTAL<br/>Gout",
        "Ardea<br/>LIGHT<br/>Gout",
        "East Polynesian<br/>Gout",
        "East Polynesian<br/>Control",
        "East Polynesian<br/>Gout",
        "East Polynesian<br/>Control",
        "West Polynesian<br/>Gout",
        "West Polynesian<br/>Control"),
      each = 2),
  "<br/>",
  c("Male", "Female"),
  # Only the 14th and 15th prefixes (the NP subsets) get the suffix
  rep(rep(c("", "<br/>NP", ""), times = c(13, 2, 2)), each = 2)
)
# Restore previously saved results from the project's Output/ directory.
# load() recreates whatever objects each .RData file holds in the current
# environment; the object names are not visible here.
# Gene-level OR results (UKBB and Tin)
load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Tin_Gene_OR.RData"))
# Model results
# NOTE(review): the forest-plot code below reads `OnsetModels` and
# `TophiModels`, presumably restored by these files -- confirm.
load(here("Output/GoutModels.RData"))
load(here("Output/OnsetModels.RData"))
load(here("Output/TophiModels.RData"))
# Functions
# Summarise a numeric vector as "mean ± sd", each to one decimal place,
# ignoring missing values. Returns the string "NA" when every element is
# missing (including the empty vector).
report <- function(x) {
  if (all(is.na(x))) {
    return("NA")
  }
  centre <- sprintf("%#.1f", mean(x, na.rm = TRUE))
  spread <- sprintf("%#.1f", sd(x, na.rm = TRUE))
  paste0(centre, " ± ", spread)
}
# Summarise a numeric vector as "median (Q1 - Q3)", ignoring missing values.
# The quartiles are the 2nd and 5th entries of summary(x). Returns the string
# "NA" when every element is missing (including the empty vector).
report_median <- function(x) {
  if (all(is.na(x))) {
    return("NA")
  }
  stats <- summary(x)
  lower_q <- stats[[2]]
  upper_q <- stats[[5]]
  paste0(median(x, na.rm = TRUE), " (", lower_q, " - ", upper_q, ")")
}
# Summarise a binary/logical vector as "count (percent)": the sum of the
# non-missing values followed by their mean times 100, to one decimal place.
# Returns the string "NA" when every element is missing.
sumreport <- function(x) {
  if (all(is.na(x))) {
    return("NA")
  }
  count <- sum(x, na.rm = TRUE)
  percent <- sprintf("%#.1f", mean(x, na.rm = TRUE) * 100)
  paste0(count, " (", percent, ")")
}
# Transpose a data frame so that its columns become rows.
# The original column names are carried along as the first column, and the
# first transposed row (i.e. the original first column) is promoted to the
# header via row_to_names().
transpose_df <- function(df) {
  flipped <- data.table::transpose(df)
  colnames(flipped) <- rownames(df)
  rownames(flipped) <- colnames(df)
  labelled <- as_tibble(rownames_to_column(flipped))
  row_to_names(labelled, row_number = 1)
}
# Describe how much of a vector is missing:
#   "+"      every element is NA (also returned for an empty vector)
#   "-"      no element is NA
#   "n (p)"  otherwise: NA count (with thousands separator) and percentage
#            to one decimal place
# NOTE: this definition masks base::missing() for code evaluated in this
# environment.
missing <- function(x) {
  n_total <- length(x)
  n_na <- sum(is.na(x))
  if (n_na == n_total) {
    "+"
  } else if (n_na == 0) {
    "-"
  } else {
    pct_na <- format(round(n_na / n_total * 100, digits = 1), nsmall = 1)
    paste0(format(n_na, big.mark = ","), " (", pct_na, ")")
  }
}
Main figures and tables
The following forest plots show the effect of the PRS on either age at onset or tophi; the tophi models are shown both unadjusted and additionally adjusted for disease duration. The effect of the individual ABCG2 variants on each severity trait is also plotted, requiring P < 0.05/2 for significance. Where appropriate, these were tested via meta-analysis to reflect the results from the full PRS models.
# ---- Forest plots: PRS / ABCG2 variants vs gout severity traits ----
# Every plot follows the same recipe: subset the model table, derive the
# ancestry-by-sex subgroup label, pool with metagen() and write a TIFF forest
# plot. The recipe lives in the helpers below so that each plot is three lines.
# As before, `tmp`, `onset`, `tophi`, `tophi_adj`, `snps` and one meta object
# per SNP (named after the SNP) are left in the calling environment.

# Derive Sex from the cohort name, strip the sex suffix from Cohort, and build
# the subgroup Label ("European"/"East Polynesian"/"West Polynesian" + sex).
add_severity_labels <- function(models) {
  models %>%
    mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                           str_detect(Cohort, "Female") ~ "Female"),
           Cohort = str_remove(Cohort, " - Male| - Female"),
           Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
                             !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
                             str_detect(Cohort, "East Polynesian") & Sex == "Male" ~ "East Polynesian Male",
                             str_detect(Cohort, "East Polynesian") & Sex == "Female" ~ "East Polynesian Female",
                             str_detect(Cohort, "West Polynesian") & Sex == "Male" ~ "West Polynesian Male",
                             str_detect(Cohort, "West Polynesian") & Sex == "Female" ~ "West Polynesian Female"))
}

# Label (already filtered) age-at-onset models and blank the effect estimate
# wherever no lower confidence limit is available.
prepare_onset_models <- function(models) {
  models %>%
    add_severity_labels() %>%
    mutate(Beta = case_when(is.na(LCL) ~ NA_real_,
                            TRUE ~ Beta))
}

# Select the tophi models for one predictor, either with (`adjusted = TRUE`)
# or without DURATION among the covariates, keeping only cohorts with more than
# 20 cases and more than 20 controls. The log-odds is blanked where LCL == 0.
prepare_tophi_models <- function(models, predictor, adjusted) {
  models %>%
    # .env$ stops a same-named column from shadowing the function arguments
    filter(Predictor == .env$predictor,
           str_detect(Covariates, "DURATION") == .env$adjusted,
           `N case` > 20,
           `N control` > 20) %>%
    add_severity_labels() %>%
    mutate(`log-odds` = case_when(LCL == 0 ~ NA_real_,
                                  TRUE ~ `log-odds`))
}

# Pool onset effects (betas) within each subgroup Label.
meta_onset <- function(dat) {
  metagen(TE = Beta,
          seTE = SE,
          studlab = Cohort,
          subgroup = Label,
          data = dat)
}

# Pool tophi effects (log-odds, reported as OR) within each subgroup Label.
meta_tophi <- function(dat) {
  metagen(TE = `log-odds`,
          seTE = SE,
          studlab = Cohort,
          subgroup = Label,
          data = dat,
          sm = "OR")
}

# Draw one forest plot into a 300 dpi TIFF with the shared styling.
#   meta_obj    meta-analysis object returned by metagen()
#   file        path relative to the project root (resolved with here())
#   title       header of the study-label column
#   count_cols  data column(s) shown next to the study label (also their header)
#   effect_lab  header of the effect column ("Beta (years)" or "OR")
#   xlim        x-axis limits
#   width       figure width in inches (height is fixed at 9.5)
# Returns the resolved file path invisibly.
save_forest_tiff <- function(meta_obj, file, title, count_cols, effect_lab,
                             xlim, width = 9) {
  out_file <- here(file)
  tiff(filename = out_file, units = "in", width = width, height = 9.5, res = 300)
  # Close the device even when forest() fails so no TIFF device is left open
  on.exit(dev.off(), add = TRUE)
  forest(meta_obj,
         xlim = xlim,
         random = FALSE,
         print.tau2 = FALSE,
         print.I2 = FALSE,
         col.square.lines = "black",
         col.diamond.lines = "black",
         col.diamond = "#1e6b52",
         col.square = "#aa9767",
         col.study = "gray60",
         col.by = "black",
         leftcols = c("studlab", count_cols),
         rightcols = c("effect", "ci"),
         leftlabs = c(title, count_cols),
         rightlabs = c(effect_lab, "[95% CI]"),
         addrows.below.overall = 2,
         overall = FALSE)
  invisible(out_file)
}

# Onset plots: beta in years, one N column.
plot_onset_forest <- function(meta_obj, file, title) {
  save_forest_tiff(meta_obj, file, title,
                   count_cols = "N",
                   effect_lab = "Beta (years)",
                   xlim = c(-15, 5))
}

# Tophi plots: odds ratios, case and control counts.
plot_tophi_forest <- function(meta_obj, file, title, width = 9) {
  save_forest_tiff(meta_obj, file, title,
                   count_cols = c("N case", "N control"),
                   effect_lab = "OR",
                   xlim = c(0.2, 5),
                   width = width)
}

# Onset vs full PRS
tmp <- OnsetModels %>%
  filter(Predictor == "PRS") %>%
  prepare_onset_models()
onset <- meta_onset(tmp)
plot_onset_forest(onset, "Output/Plots/OnsetvsPRS.tiff",
                  "Effect of PRS on Age at Onset")

# Onset vs PRS less ABCG2
tmp <- OnsetModels %>%
  filter(Predictor == "PRS_noABCG2") %>%
  prepare_onset_models()
onset <- meta_onset(tmp)
plot_onset_forest(onset, "Output/Plots/OnsetvsPRS2.tiff",
                  "Effect of PRS on Age at Onset (no ABCG2)")

# Onset vs Tin Urate PRS
tmp <- OnsetModels %>%
  filter(Predictor == "Urate_PRS") %>%
  prepare_onset_models()
onset <- meta_onset(tmp)
plot_onset_forest(onset, "Output/Plots/OnsetvsPRS3.tiff",
                  "Effect of Tin Urate PRS on Age at Onset")

# ABCG2 variants: one meta-analysis and one plot per SNP found in the models
# (previously hard-coded to the first two SNPs).
tmp <- OnsetModels %>%
  filter(str_detect(Predictor, "rs")) %>%
  prepare_onset_models()
snps <- unique(as.character(tmp$Predictor))
for (i in snps) {
  tmp2 <- tmp %>%
    filter(Predictor == i)
  # Keep the per-SNP meta object available under the SNP's own name
  assign(i, meta_onset(tmp2))
  plot_onset_forest(get(i),
                    paste0("Output/Plots/Onsetvs", i, ".tiff"),
                    paste0("Δ Age at Onset per ", i, " allele"))
}

# Tophi vs full PRS (unadjusted)
tmp <- prepare_tophi_models(TophiModels, "PRS", adjusted = FALSE)
tophi <- meta_tophi(tmp)
plot_tophi_forest(tophi, "Output/Plots/TophivsPRS.tiff",
                  "Effect of PRS on Tophaceous Gout")

# Tophi vs full PRS (adjusted)
tmp <- prepare_tophi_models(TophiModels, "PRS", adjusted = TRUE)
tophi_adj <- meta_tophi(tmp)
plot_tophi_forest(tophi_adj, "Output/Plots/TophivsPRS_adj.tiff",
                  "Effect of PRS on Tophaceous Gout (Duration Adjusted)")

# Tophi vs PRS less ABCG2 (unadjusted)
tmp <- prepare_tophi_models(TophiModels, "PRS_noABCG2", adjusted = FALSE)
tophi <- meta_tophi(tmp)
plot_tophi_forest(tophi, "Output/Plots/TophivsPRS2.tiff",
                  "Effect of PRS on Tophaceous Gout (no ABCG2)")

# Tophi vs PRS less ABCG2 (adjusted)
tmp <- prepare_tophi_models(TophiModels, "PRS_noABCG2", adjusted = TRUE)
tophi_adj <- meta_tophi(tmp)
plot_tophi_forest(tophi_adj, "Output/Plots/TophivsPRS2_adj.tiff",
                  "Effect of PRS on Tophaceous Gout (no ABCG2 + Duration Adjusted)",
                  width = 10)

# Tophi vs Urate PRS (unadjusted)
tmp <- prepare_tophi_models(TophiModels, "Urate_PRS", adjusted = FALSE)
tophi <- meta_tophi(tmp)
plot_tophi_forest(tophi, "Output/Plots/TophivsPRS3.tiff",
                  "Effect of Tin Urate PRS on Tophaceous Gout")

# Tophi vs Urate PRS (adjusted)
tmp <- prepare_tophi_models(TophiModels, "Urate_PRS", adjusted = TRUE)
tophi_adj <- meta_tophi(tmp)
plot_tophi_forest(tophi_adj, "Output/Plots/TophivsPRS3_adj.tiff",
                  "Effect of Tin Urate PRS on Tophaceous Gout (Duration Adjusted)",
                  width = 10)

# Tophi vs rs2231142 (unadjusted)
tmp <- prepare_tophi_models(TophiModels, "rs2231142", adjusted = FALSE)
tophi <- meta_tophi(tmp)
plot_tophi_forest(tophi, "Output/Plots/Tophivsrs2231142.tiff",
                  "Effect of rs2231142 on Tophaceous Gout")

# Tophi vs rs2231142 (adjusted)
tmp <- prepare_tophi_models(TophiModels, "rs2231142", adjusted = TRUE)
tophi_adj <- meta_tophi(tmp)
plot_tophi_forest(tophi_adj, "Output/Plots/Tophivsrs2231142_adj.tiff",
                  "Effect of rs2231142 on Tophaceous Gout (Duration Adjusted)",
                  width = 10)

# Tophi vs rs10011796 (unadjusted)
tmp <- prepare_tophi_models(TophiModels, "rs10011796", adjusted = FALSE)
tophi <- meta_tophi(tmp)
plot_tophi_forest(tophi, "Output/Plots/Tophivsrs10011796.tiff",
                  "Effect of rs10011796 on Tophaceous Gout")

# Tophi vs rs10011796 (adjusted)
tmp <- prepare_tophi_models(TophiModels, "rs10011796", adjusted = TRUE)
tophi_adj <- meta_tophi(tmp)
plot_tophi_forest(tophi_adj, "Output/Plots/Tophivsrs10011796_adj.tiff",
                  "Effect of rs10011796 on Tophaceous Gout (Duration Adjusted)",
                  width = 10)
The following table describes the cohort statistics for every variable that I have deemed to have sufficient non-missing data. If a variable is missing in more than 50% of a single cohort, then that cohort is set to “too much missing” for that variable. If no cohort (excluding UKBB) has fewer than 30% missing, then the variable is removed from the table.
# Table 1: cohort characteristics.
# Build one row per cohort subset (formatted summary strings from report(),
# report_median() and sumreport()), then transpose so that cohorts become
# columns and variables become rows before rendering with kable.
table1 <- tibble("Cohort" = cohortstring,
                 "N" = unlist(lapply(data_list, nrow)),
                 "Age at Collection (years)" = unlist(lapply(data_list, function(x) report(x$AGECOL))),
                 "Serum Urate (mg/dL)" = unlist(lapply(data_list, function(x) report(x$URATE))),
                 "ULT" = unlist(lapply(data_list, function(x) sumreport(x$ULT))),
                 "Age at Onset (years)" = unlist(lapply(data_list, function(x) report(x$AGE1ATK))),
                 "Disease Duration (years)" = unlist(lapply(data_list, function(x) report(x$DURATION))),
                 "Number of Flares in Last Year" = unlist(lapply(data_list, function(x) report_median(x$NUMATK))),
                 "Presence of Tophi" = unlist(lapply(data_list, function(x) sumreport(x$TOPHIGOUT))),
                 "PRS" = unlist(lapply(data_list, function(x) report(x$PRS))),
                 "BMI" = unlist(lapply(data_list, function(x) report(x$BMI))),
                 "Type 2 Diabetes" = unlist(lapply(data_list, function(x) sumreport(x$DIABETES))))
table1 <- transpose_df(table1) %>%
  column_to_rownames(var = "Cohort") %>%
  # Apply the replacement to EVERY cohort column. The previous selection,
  # 1:ncol(table1), read the column count of the not-yet-transposed table
  # (one column per variable), so only the first few cohort columns were
  # processed.
  # NOTE(review): the replacement is spelled as an explicit non-breaking space
  # ("\u00a0") so cell values do not wrap; the original literal was visually
  # indistinguishable from a plain space -- confirm this is the intended
  # character.
  mutate(across(.cols = everything(),
                ~ str_replace_all(string = .x, pattern = " ", replacement = "\u00a0")))
# Same treatment for the variable names shown in the first column
row.names(table1) <- str_replace_all(row.names(table1), " ", "\u00a0")
table1 %>%
  kable(col.names = clean_names,
        align = "c",
        escape = FALSE) %>%  # headers contain raw <br/> tags
  kable_styling("striped") %>%
  scroll_box(width = "900px", height = "475px") %>%
  footnote("Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).")
|
|
UK Biobank Gout Male
|
UK Biobank Gout Female
|
UK Biobank Control Male
|
UK Biobank Control Female
|
Aus/NZ European Gout Male
|
Aus/NZ European Gout Female
|
Aus/NZ European Control Male
|
Aus/NZ European Control Female
|
GlobalGout Gout Male
|
GlobalGout Gout Female
|
GlobalGout Control Male
|
GlobalGout Control Female
|
Ardea LASSO Gout Male
|
Ardea LASSO Gout Female
|
Ardea CLEAR1 Gout Male
|
Ardea CLEAR1 Gout Female
|
Ardea CLEAR2 Gout Male
|
Ardea CLEAR2 Gout Female
|
Ardea CRYSTAL Gout Male
|
Ardea CRYSTAL Gout Female
|
Ardea LIGHT Gout Male
|
Ardea LIGHT Gout Female
|
East Polynesian Gout Male
|
East Polynesian Gout Female
|
East Polynesian Control Male
|
East Polynesian Control Female
|
East Polynesian Gout Male NP
|
East Polynesian Gout Female NP
|
East Polynesian Control Male NP
|
East Polynesian Control Female NP
|
West Polynesian Gout Male
|
West Polynesian Gout Female
|
West Polynesian Control Male
|
West Polynesian Control Female
|
|
N
|
7930
|
945
|
167249
|
202100
|
978
|
210
|
764
|
619
|
1032
|
124
|
44
|
79
|
819
|
65
|
230
|
16
|
239
|
7
|
175
|
4
|
109
|
8
|
408
|
122
|
241
|
382
|
124
|
28
|
44
|
37
|
436
|
54
|
199
|
180
|
|
Age at Collection (years)
|
59.9 ± 7.0
|
61.9 ± 6.1
|
57.0 ± 8.1
|
56.7 ± 7.9
|
62.4 ± 12.4
|
70.0 ± 12.7
|
54.9 ± 17.1
|
51.3 ± 17.3
|
60.1 ± 13.1
|
67.6 ± 11.1
|
60.0 ± 14.8
|
64.2 ± 11.5
|
51.4 ± 11.8
|
60.7 ± 10.6
|
52.3 ± 11.1
|
61.4 ± 7.4
|
53.0 ± 10.8
|
55.0 ± 16.0
|
53.9 ± 11.0
|
63.8 ± 5.4
|
53.3 ± 11.8
|
64.0 ± 16.1
|
54.3 ± 12.4
|
60.7 ± 11.7
|
43.9 ± 15.6
|
45.6 ± 14.8
|
59.7 ± 11.3
|
59.1 ± 13.3
|
49.8 ± 13.5
|
48.7 ± 17.1
|
47.5 ± 12.3
|
53.4 ± 13.4
|
39.3 ± 15.0
|
40.5 ± 15.4
|
|
Serum Urate (mg/dL)
|
6.7 ± 1.7
|
6.0 ± 2.0
|
5.9 ± 1.2
|
4.5 ± 1.1
|
6.7 ± 1.9
|
6.4 ± 2.3
|
5.5 ± 2.8
|
3.3 ± 2.6
|
7.4 ± 2.3
|
7.7 ± 2.6
|
6.6 ± 1.7
|
6.6 ± 1.7
|
8.9 ± 1.3
|
8.9 ± 1.4
|
7.9 ± 1.4
|
8.1 ± 1.2
|
7.9 ± 1.5
|
8.4 ± 2.0
|
8.8 ± 1.5
|
10.1 ± 1.2
|
9.3 ± 1.7
|
8.1 ± 1.4
|
7.0 ± 2.3
|
6.3 ± 2.5
|
6.5 ± 1.9
|
5.4 ± 1.7
|
7.0 ± 1.7
|
6.9 ± 2.4
|
6.3 ± 1.3
|
5.3 ± 1.3
|
7.7 ± 2.1
|
7.0 ± 2.7
|
6.6 ± 1.8
|
5.3 ± 1.7
|
|
ULT
|
4505 (56.8)
|
372 (39.4)
|
NA
|
NA
|
564 (99.6)
|
108 (98.2)
|
NA
|
NA
|
570 (75.7)
|
49 (59.8)
|
NA
|
NA
|
255 (31.2)
|
26 (40.6)
|
230 (100.0)
|
16 (100.0)
|
239 (100.0)
|
7 (100.0)
|
94 (100.0)
|
1 (100.0)
|
109 (100.0)
|
8 (100.0)
|
262 (92.6)
|
86 (96.6)
|
NA
|
NA
|
96 (93.2)
|
19 (73.1)
|
NA
|
NA
|
292 (94.2)
|
38 (95.0)
|
NA
|
NA
|
|
Age at Onset (years)
|
NA
|
NA
|
NA
|
NA
|
46.4 ± 15.8
|
59.5 ± 15.7
|
NA
|
NA
|
46.5 ± 14.0
|
57.8 ± 12.5
|
NA
|
NA
|
41.4 ± 13.4
|
55.1 ± 12.0
|
41.9 ± 12.4
|
55.2 ± 11.2
|
42.6 ± 13.2
|
46.1 ± 20.6
|
40.1 ± 13.0
|
61.5 ± 6.2
|
42.4 ± 13.1
|
55.2 ± 17.3
|
37.9 ± 14.0
|
49.4 ± 15.4
|
NA
|
NA
|
39.1 ± 15.2
|
46.0 ± 16.8
|
NA
|
NA
|
34.6 ± 12.0
|
44.3 ± 15.0
|
NA
|
NA
|
|
Disease Duration (years)
|
NA
|
NA
|
NA
|
NA
|
16.8 ± 12.7
|
10.9 ± 10.4
|
NA
|
NA
|
14.5 ± 11.4
|
10.6 ± 9.8
|
NA
|
NA
|
11.0 ± 9.4
|
6.6 ± 8.0
|
11.4 ± 9.4
|
7.1 ± 9.5
|
11.4 ± 9.8
|
9.9 ± 11.6
|
14.8 ± 10.0
|
3.2 ± 1.0
|
11.9 ± 8.7
|
9.8 ± 11.0
|
17.2 ± 12.8
|
13.1 ± 13.2
|
NA
|
NA
|
21.7 ± 15.3
|
14.1 ± 12.6
|
NA
|
NA
|
13.6 ± 10.3
|
9.2 ± 9.2
|
NA
|
NA
|
|
Number of Flares in Last Year
|
NA
|
NA
|
NA
|
NA
|
2 (0 - 4)
|
1.5 (0 - 3.25)
|
NA
|
NA
|
2 (1 - 4)
|
2.5 (1 - 4)
|
NA
|
NA
|
4 (3 - 8)
|
3 (3 - 6)
|
3 (2 - 6)
|
3 (3 - 4)
|
4 (2 - 8)
|
5 (3 - 6)
|
4 (3 - 6)
|
4.5 (2.25 - 6)
|
4 (2 - 10)
|
4 (2.75 - 5.25)
|
3 (1 - 6)
|
2 (0 - 5)
|
NA
|
NA
|
2 (0 - 3)
|
3 (1 - 6)
|
NA
|
NA
|
4 (2 - 10)
|
2 (1 - 5)
|
NA
|
NA
|
|
Presence of Tophi
|
NA
|
NA
|
NA
|
NA
|
333 (43.4)
|
67 (39.9)
|
NA
|
NA
|
320 (57.6)
|
46 (62.2)
|
NA
|
NA
|
138 (16.8)
|
5 (7.7)
|
34 (14.9)
|
1 (6.2)
|
54 (22.6)
|
2 (28.6)
|
174 (99.4)
|
4 (100.0)
|
26 (23.9)
|
5 (62.5)
|
144 (41.3)
|
26 (28.3)
|
NA
|
NA
|
9 (12.2)
|
4 (19.0)
|
NA
|
NA
|
177 (44.6)
|
14 (28.6)
|
NA
|
NA
|
|
PRS
|
4.0 ± 0.6
|
4.0 ± 0.7
|
3.7 ± 0.6
|
3.7 ± 0.6
|
4.1 ± 0.7
|
4.0 ± 0.6
|
3.7 ± 0.6
|
3.7 ± 0.6
|
4.0 ± 0.6
|
4.0 ± 0.6
|
3.8 ± 0.6
|
3.8 ± 0.6
|
4.1 ± 0.7
|
4.1 ± 0.6
|
4.2 ± 0.7
|
4.3 ± 0.6
|
4.2 ± 0.6
|
4.3 ± 0.8
|
4.2 ± 0.6
|
4.0 ± 0.4
|
4.1 ± 0.6
|
4.2 ± 0.2
|
4.4 ± 0.5
|
4.4 ± 0.5
|
4.2 ± 0.4
|
4.2 ± 0.5
|
4.2 ± 0.5
|
4.4 ± 0.5
|
4.2 ± 0.5
|
4.1 ± 0.6
|
4.8 ± 0.6
|
4.7 ± 0.6
|
4.3 ± 0.6
|
4.3 ± 0.6
|
|
BMI
|
30.5 ± 4.8
|
32.3 ± 6.6
|
27.7 ± 4.2
|
27.0 ± 5.1
|
30.3 ± 5.2
|
30.9 ± 7.3
|
27.2 ± 4.7
|
27.0 ± 6.2
|
29.4 ± 4.7
|
30.9 ± 6.7
|
NA
|
NA
|
34.1 ± 6.7
|
38.0 ± 10.3
|
34.6 ± 6.1
|
38.1 ± 6.5
|
33.7 ± 6.0
|
36.2 ± 7.5
|
32.2 ± 5.4
|
36.5 ± 3.8
|
31.3 ± 4.9
|
35.7 ± 8.7
|
35.4 ± 8.0
|
38.2 ± 9.8
|
31.9 ± 7.1
|
32.7 ± 8.5
|
35.9 ± 7.7
|
39.5 ± 7.6
|
32.5 ± 7.8
|
29.1 ± 6.1
|
36.1 ± 6.7
|
38.5 ± 9.1
|
33.1 ± 6.2
|
34.3 ± 7.7
|
|
Type 2 Diabetes
|
1431 (20.1)
|
227 (26.7)
|
13115 (8.8)
|
8479 (4.8)
|
144 (15.9)
|
51 (26.4)
|
55 (12.2)
|
48 (13.6)
|
350 (41.1)
|
53 (55.8)
|
NA
|
NA
|
79 (9.6)
|
16 (24.6)
|
30 (13.0)
|
6 (37.5)
|
32 (13.4)
|
0 (0.0)
|
24 (13.7)
|
2 (50.0)
|
12 (11.0)
|
0 (0.0)
|
121 (30.5)
|
61 (50.4)
|
53 (22.8)
|
77 (21.2)
|
38 (100.0)
|
10 (100.0)
|
7 (100.0)
|
1 (100.0)
|
80 (19.0)
|
26 (48.1)
|
33 (17.1)
|
44 (25.6)
|
|
Note:
|
|
Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).
|
Supplementary figures and tables
Manhattan plot for the gout GWAS showing the locations of the SNPs that were used in the PRS.
# Manhattan plot of UKBB gout GWAS
# Preparing data ----
# Loads `sumstat_final` (the GWAS summary statistics object) into the session.
load(here("Output/sumstat_final.RData"))

# Keep variants with P < 0.01 and label the variants used in the PRS with
# their locus name. The name is looked up by RSID with match() rather than
# assigned by position, so the labels stay correct whatever the row order of
# UKBB_Gene_OR is, and a PRS variant that is absent from the filtered data no
# longer causes a length-mismatch error. Variants not in the PRS get NA.
GOUT_pValues <- sumstat_final %>%
  filter(P < 0.01) %>%
  arrange(CHR, BP) %>%
  mutate(Gene = UKBB_Gene_OR$Locus_Name[match(RSID, UKBB_Gene_OR$RSID)])

GOUT_pValues2 <- GOUT_pValues %>%
  # Compute chromosome size (as double: summing integer base-pair positions
  # across chromosomes would overflow the integer range and yield NA)
  group_by(CHR) %>%
  summarise(chr_len = as.numeric(max(BP))) %>%
  # Calculate cumulative position of each chromosome
  mutate(tot = cumsum(chr_len) - chr_len) %>%
  select(-chr_len) %>%
  # Add this info to the initial dataset
  left_join(GOUT_pValues, ., by = "CHR") %>%
  # Add a cumulative position of each SNP
  arrange(CHR, BP) %>%
  mutate(BPcum = BP + tot) %>%
  # Add highlight and annotation information (both flag the PRS variants)
  mutate(
    is_highlight = ifelse(RSID %in% UKBB_Gene_OR$RSID, "yes", "no"),
    is_annotate = ifelse(RSID %in% UKBB_Gene_OR$RSID, "yes", "no")
  )

# Midpoint of each chromosome on the cumulative axis, used for tick placement
axisdf <- GOUT_pValues2 %>%
  group_by(CHR) %>%
  summarize(center = (max(BPcum) + min(BPcum)) / 2)

ggplot(GOUT_pValues2, aes(x = BPcum, y = -log10(P))) +
  # Show all points, alternating two colours between chromosomes
  geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
  scale_color_manual(values = rep(c("#1e6b52", "#aa9767"), 22)) +
  # Genome-wide significance threshold
  geom_hline(yintercept = -log10(5e-8), colour = "red") +
  # Custom X axis: one tick per chromosome, centred on that chromosome
  scale_x_continuous(labels = axisdf$CHR, breaks = axisdf$center) +
  # Remove space between plot area and x axis
  scale_y_continuous(expand = c(0, 0), limits = c(0, 250)) +
  # Add highlighted points
  geom_point(
    data = subset(GOUT_pValues2, is_highlight == "yes"),
    color = "orange",
    size = 2
  ) +
  # Add label using ggrepel to avoid overlapping
  geom_label_repel(
    aes(label = Gene),
    size = 3,
    box.padding = 0.5,
    force_pull = 0.5,
    nudge_y = 3,
    max.overlaps = Inf
  ) +
  xlab("Chromosome") +
  ggtitle("UK Biobank Gout GWAS Results") +
  # Custom the theme:
  theme_bw() +
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

# ggsave(filename = here("Output/Plots/Manhattan.tiff"), dpi = 600)
Table of GWAS results for PRS.
Missing data table
# Missing-data summary: one row per cohort and one column per variable, then
# transposed so that the cohorts run across the page.
# NOTE(review): `missing()` must be the project helper defined elsewhere in
# this file (it masks base::missing); presumably it returns a formatted
# count (%) of missing values for the supplied column - confirm.
table1 <- tibble(
  "Cohort" = cohortstring,
  "N" = unlist(lapply(data_list, nrow)),
  "Age at Collection" = unlist(lapply(data_list, function(x) missing(x$AGECOL))),
  "Serum Urate" = unlist(lapply(data_list, function(x) missing(x$URATE))),
  "ULT" = unlist(lapply(data_list, function(x) missing(x$ULT))),
  "Age at Onset" = unlist(lapply(data_list, function(x) missing(x$AGE1ATK))),
  "Disease Duration" = unlist(lapply(data_list, function(x) missing(x$DURATION))),
  "Flares" = unlist(lapply(data_list, function(x) missing(x$NUMATK))),
  "Tophi" = unlist(lapply(data_list, function(x) missing(x$TOPHIGOUT))),
  "PRS" = unlist(lapply(data_list, function(x) missing(x$PRS))),
  "Prophylaxis" = unlist(lapply(data_list, function(x) missing(x$PROPHY))),
  "BMI" = unlist(lapply(data_list, function(x) missing(x$BMI))),
  "Hypertension" = unlist(lapply(data_list, function(x) missing(x$HYPERTENSION))),
  "Type 2 Diabetes" = unlist(lapply(data_list, function(x) missing(x$DIABETES))),
  "Heart Disease" = unlist(lapply(data_list, function(x) missing(x$HEART))),
  "Kidney Disease" = unlist(lapply(data_list, function(x) missing(x$KIDNEY))),
  "Dyslipidemia" = unlist(lapply(data_list, function(x) missing(x$LIPIDS))),
  "Stroke" = unlist(lapply(data_list, function(x) missing(x$STROKE))),
  "Alcoholic Drinks / Week" = unlist(lapply(data_list, function(x) missing(x$TOTALALC))),
  "Sugar-Sweetened Drinks / Week" = unlist(lapply(data_list, function(x) missing(x$SUGDRINK))),
  "Current Smoker" = unlist(lapply(data_list, function(x) missing(x$CURSMOKE))),
  "Family History of Gout" = unlist(lapply(data_list, function(x) missing(x$FAMGOUT))),
  "No. Relatives w/ Gout" = unlist(lapply(data_list, function(x) missing(x$FAMGOUTNUM)))
)

table1 <- transpose_df(table1)

# Export while the row labels are still an ordinary column: readr never writes
# row names, so writing after column_to_rownames() would drop the variable
# labels from the CSV.
write_csv(table1, file = here("Output/Tables/missing.csv"))

table1 <- table1 %>%
  column_to_rownames(var = "Cohort")

# Display only: turn the first space of every cell and row label into a
# non-breaking space so that e.g. "372 (4.7)" is not wrapped inside the HTML
# table. Written as an explicit escape so the character is visible in source.
table1 <- table1 %>%
  mutate(across(
    .cols = everything(),
    ~ str_replace(string = .x, pattern = " ", replacement = "\u00a0")
  ))
row.names(table1) <- str_replace(row.names(table1), " ", "\u00a0")

table1 %>%
  kable(
    col.names = clean_names,
    align = "c",
    escape = FALSE
  ) %>%
  kable_styling("striped") %>%
  scroll_box(width = "900px", height = "475px") %>%
  footnote("'All' = all missing, 'None' = none missing")
|
|
UK Biobank Gout Male
|
UK Biobank Gout Female
|
UK Biobank Control Male
|
UK Biobank Control Female
|
Aus/NZ European Gout Male
|
Aus/NZ European Gout Female
|
Aus/NZ European Control Male
|
Aus/NZ European Control Female
|
GlobalGout Gout Male
|
GlobalGout Gout Female
|
GlobalGout Control Male
|
GlobalGout Control Female
|
Ardea LASSO Gout Male
|
Ardea LASSO Gout Female
|
Ardea CLEAR1 Gout Male
|
Ardea CLEAR1 Gout Female
|
Ardea CLEAR2 Gout Male
|
Ardea CLEAR2 Gout Female
|
Ardea CRYSTAL Gout Male
|
Ardea CRYSTAL Gout Female
|
Ardea LIGHT Gout Male
|
Ardea LIGHT Gout Female
|
East Polynesian Gout Male
|
East Polynesian Gout Female
|
East Polynesian Control Male
|
East Polynesian Control Female
|
East Polynesian Gout Male NP
|
East Polynesian Gout Female NP
|
East Polynesian Control Male NP
|
East Polynesian Control Female NP
|
West Polynesian Gout Male
|
West Polynesian Gout Female
|
West Polynesian Control Male
|
West Polynesian Control Female
|
|
N
|
7930
|
945
|
167249
|
202100
|
978
|
210
|
764
|
619
|
1032
|
124
|
44
|
79
|
819
|
65
|
230
|
16
|
239
|
7
|
175
|
4
|
109
|
8
|
408
|
122
|
241
|
382
|
124
|
28
|
44
|
37
|
436
|
54
|
199
|
180
|
|
Age at Collection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Serum Urate
|
372 (4.7)
|
58 (6.1)
|
7,828 (4.7)
|
9,738 (4.8)
|
18 (1.8)
|
4 (1.9)
|
36 (4.7)
|
24 (3.9)
|
50 (4.8)
|
8 (6.5)
|
32 (72.7)
|
71 (89.9)
|
4 (0.5)
|
1 (1.5)
|
1 (0.4)
|
|
1 (0.4)
|
|
|
|
|
|
2 (0.5)
|
1 (0.8)
|
35 (14.5)
|
64 (16.8)
|
|
1 (3.6)
|
|
|
1 (0.2)
|
1 (1.9)
|
21 (10.6)
|
16 (8.9)
|
|
ULT
|
|
|
|
|
412 (42.1)
|
100 (47.6)
|
|
|
279 (27.0)
|
42 (33.9)
|
|
|
3 (0.4)
|
1 (1.5)
|
|
|
|
|
81 (46.3)
|
3 (75.0)
|
|
|
125 (30.6)
|
33 (27.0)
|
|
|
21 (16.9)
|
2 (7.1)
|
|
|
126 (28.9)
|
14 (25.9)
|
|
|
|
Age at Onset
|
|
|
|
|
51 (5.2)
|
24 (11.4)
|
|
|
25 (2.4)
|
6 (4.8)
|
|
|
|
|
|
|
|
|
|
|
|
|
27 (6.6)
|
9 (7.4)
|
|
|
1 (0.8)
|
|
|
|
18 (4.1)
|
9 (16.7)
|
|
|
|
Disease Duration
|
|
|
|
|
51 (5.2)
|
24 (11.4)
|
|
|
25 (2.4)
|
6 (4.8)
|
|
|
|
|
|
|
|
|
|
|
|
|
27 (6.6)
|
9 (7.4)
|
|
|
1 (0.8)
|
|
|
|
18 (4.1)
|
9 (16.7)
|
|
|
|
Flares
|
|
|
|
|
119 (12.2)
|
34 (16.2)
|
|
|
72 (7.0)
|
6 (4.8)
|
|
|
|
|
|
|
|
|
|
|
|
|
32 (7.8)
|
14 (11.5)
|
|
|
3 (2.4)
|
2 (7.1)
|
|
|
20 (4.6)
|
9 (16.7)
|
|
|
|
Tophi
|
|
|
|
|
210 (21.5)
|
42 (20.0)
|
|
|
476 (46.1)
|
50 (40.3)
|
|
|
|
|
2 (0.9)
|
|
|
|
|
|
|
|
59 (14.5)
|
30 (24.6)
|
|
|
50 (40.3)
|
7 (25.0)
|
|
|
39 (8.9)
|
5 (9.3)
|
|
|
|
PRS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Prophylaxis
|
|
|
|
|
918 (93.9)
|
196 (93.3)
|
763 (99.9)
|
|
317 (30.7)
|
43 (34.7)
|
|
|
|
|
|
|
|
|
|
|
|
|
348 (85.3)
|
108 (88.5)
|
240 (99.6)
|
|
|
2 (7.1)
|
43 (97.7)
|
|
390 (89.4)
|
45 (83.3)
|
|
179 (99.4)
|
|
BMI
|
30 (0.4)
|
8 (0.8)
|
580 (0.3)
|
606 (0.3)
|
58 (5.9)
|
18 (8.6)
|
174 (22.8)
|
53 (8.6)
|
44 (4.3)
|
2 (1.6)
|
|
|
5 (0.6)
|
|
1 (0.4)
|
|
|
|
|
|
|
|
7 (1.7)
|
4 (3.3)
|
3 (1.2)
|
13 (3.4)
|
2 (1.6)
|
|
4 (9.1)
|
6 (16.2)
|
10 (2.3)
|
3 (5.6)
|
3 (1.5)
|
8 (4.4)
|
|
Hypertension
|
|
|
|
|
383 (39.2)
|
36 (17.1)
|
486 (63.6)
|
322 (52.0)
|
365 (35.4)
|
35 (28.2)
|
|
|
|
|
1 (0.4)
|
|
|
|
|
|
|
|
120 (29.4)
|
14 (11.5)
|
171 (71.0)
|
252 (66.0)
|
30 (24.2)
|
3 (10.7)
|
32 (72.7)
|
28 (75.7)
|
196 (45.0)
|
13 (24.1)
|
158 (79.4)
|
135 (75.0)
|
|
Type 2 Diabetes
|
796 (10.0)
|
95 (10.1)
|
18,200 (10.9)
|
25,918 (12.8)
|
72 (7.4)
|
17 (8.1)
|
312 (40.8)
|
266 (43.0)
|
181 (17.5)
|
29 (23.4)
|
|
|
|
|
|
|
|
|
|
|
|
|
11 (2.7)
|
1 (0.8)
|
9 (3.7)
|
18 (4.7)
|
86 (69.4)
|
18 (64.3)
|
37 (84.1)
|
36 (97.3)
|
15 (3.4)
|
|
6 (3.0)
|
8 (4.4)
|
|
Heart Disease
|
|
|
|
|
209 (21.4)
|
32 (15.2)
|
379 (49.6)
|
292 (47.2)
|
518 (50.2)
|
52 (41.9)
|
|
|
|
|
1 (0.4)
|
|
|
|
|
|
|
|
45 (11.0)
|
11 (9.0)
|
28 (11.6)
|
60 (15.7)
|
46 (37.1)
|
7 (25.0)
|
29 (65.9)
|
24 (64.9)
|
34 (7.8)
|
4 (7.4)
|
46 (23.1)
|
13 (7.2)
|
|
Kidney Disease
|
361 (4.6)
|
47 (5.0)
|
7,628 (4.6)
|
9,486 (4.7)
|
222 (22.7)
|
43 (20.5)
|
383 (50.1)
|
261 (42.2)
|
512 (49.6)
|
44 (35.5)
|
|
|
5 (0.6)
|
|
2 (0.9)
|
|
|
|
|
|
|
|
79 (19.4)
|
25 (20.5)
|
191 (79.3)
|
308 (80.6)
|
94 (75.8)
|
16 (57.1)
|
38 (86.4)
|
29 (78.4)
|
73 (16.7)
|
9 (16.7)
|
160 (80.4)
|
141 (78.3)
|
|
Dyslipidemia
|
|
|
|
|
376 (38.4)
|
71 (33.8)
|
438 (57.3)
|
214 (34.6)
|
255 (24.7)
|
35 (28.2)
|
|
|
|
|
|
|
|
|
1 (0.6)
|
|
|
|
139 (34.1)
|
39 (32.0)
|
171 (71.0)
|
269 (70.4)
|
53 (42.7)
|
7 (25.0)
|
28 (63.6)
|
29 (78.4)
|
150 (34.4)
|
14 (25.9)
|
147 (73.9)
|
115 (63.9)
|
|
Stroke
|
|
|
|
|
298 (30.5)
|
62 (29.5)
|
267 (34.9)
|
90 (14.5)
|
603 (58.4)
|
73 (58.9)
|
|
|
|
|
|
|
|
|
|
|
|
|
63 (15.4)
|
16 (13.1)
|
32 (13.3)
|
49 (12.8)
|
66 (53.2)
|
11 (39.3)
|
31 (70.5)
|
26 (70.3)
|
48 (11.0)
|
5 (9.3)
|
44 (22.1)
|
14 (7.8)
|
|
Alcoholic Drinks / Week
|
1,109 (14.0)
|
448 (47.4)
|
34,518 (20.6)
|
70,722 (35.0)
|
|
|
123 (16.1)
|
202 (32.6)
|
509 (49.3)
|
57 (46.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
34 (8.3)
|
3 (2.5)
|
2 (0.8)
|
7 (1.8)
|
|
|
|
|
33 (7.6)
|
2 (3.7)
|
3 (1.5)
|
1 (0.6)
|
|
Sugar-Sweetened Drinks / Week
|
|
|
|
|
106 (10.8)
|
27 (12.9)
|
172 (22.5)
|
51 (8.2)
|
762 (73.8)
|
94 (75.8)
|
|
|
|
|
|
|
|
|
|
|
|
|
75 (18.4)
|
17 (13.9)
|
6 (2.5)
|
10 (2.6)
|
|
|
|
|
87 (20.0)
|
4 (7.4)
|
4 (2.0)
|
1 (0.6)
|
|
Current Smoker
|
|
|
|
|
463 (47.3)
|
124 (59.0)
|
262 (34.3)
|
142 (22.9)
|
464 (45.0)
|
47 (37.9)
|
|
|
|
|
2 (0.9)
|
|
|
|
|
|
3 (2.8)
|
|
211 (51.7)
|
69 (56.6)
|
100 (41.5)
|
190 (49.7)
|
57 (46.0)
|
9 (32.1)
|
29 (65.9)
|
25 (67.6)
|
312 (71.6)
|
21 (38.9)
|
92 (46.2)
|
69 (38.3)
|
|
Family History of Gout
|
|
|
|
|
71 (7.3)
|
21 (10.0)
|
408 (53.4)
|
298 (48.1)
|
244 (23.6)
|
19 (15.3)
|
35 (79.5)
|
71 (89.9)
|
|
|
|
|
|
|
|
|
|
|
56 (13.7)
|
15 (12.3)
|
42 (17.4)
|
58 (15.2)
|
8 (6.5)
|
2 (7.1)
|
3 (6.8)
|
4 (10.8)
|
54 (12.4)
|
2 (3.7)
|
26 (13.1)
|
23 (12.8)
|
|
No. Relatives w/ Gout
|
|
|
|
|
282 (28.8)
|
64 (30.5)
|
531 (69.5)
|
459 (74.2)
|
643 (62.3)
|
84 (67.7)
|
|
|
|
|
|
|
|
|
|
|
|
|
104 (25.5)
|
30 (24.6)
|
65 (27.0)
|
88 (23.0)
|
36 (29.0)
|
8 (28.6)
|
17 (38.6)
|
10 (27.0)
|
84 (19.3)
|
13 (24.1)
|
45 (22.6)
|
46 (25.6)
|
|
Note:
|
|
‘All’ = all missing, ‘None’ = none missing
|
Full cohort stats table
# Full cohort descriptive statistics: one row per cohort and one column per
# variable, then transposed so that the cohorts run across the page.
# NOTE(review): `report()`, `report_median()` and `sumreport()` are helpers
# defined elsewhere in this file; per the table footnote they presumably
# format mean ± sd, median (IQR) and N (%) respectively - confirm.
table1 <- tibble(
  "Cohort" = cohortstring,
  "N" = unlist(lapply(data_list, nrow)),
  "Age at Collection (years)" = unlist(lapply(data_list, function(x) report(x$AGECOL))),
  "Serum Urate (mg/dL)" = unlist(lapply(data_list, function(x) report(x$URATE))),
  "ULT" = unlist(lapply(data_list, function(x) sumreport(x$ULT))),
  "Age at Onset (years)" = unlist(lapply(data_list, function(x) report(x$AGE1ATK))),
  "Disease Duration (years)" = unlist(lapply(data_list, function(x) report(x$DURATION))),
  "Number of Flares in Last Year" = unlist(lapply(data_list, function(x) report_median(x$NUMATK))),
  "Presence of Tophi" = unlist(lapply(data_list, function(x) sumreport(x$TOPHIGOUT))),
  "PRS" = unlist(lapply(data_list, function(x) report(x$PRS))),
  "Prophylaxis" = unlist(lapply(data_list, function(x) sumreport(x$PROPHY))),
  "BMI" = unlist(lapply(data_list, function(x) report(x$BMI))),
  "Hypertension" = unlist(lapply(data_list, function(x) sumreport(x$HYPERTENSION))),
  "Type 2 Diabetes" = unlist(lapply(data_list, function(x) sumreport(x$DIABETES))),
  "Heart Disease" = unlist(lapply(data_list, function(x) sumreport(x$HEART))),
  "Kidney Disease" = unlist(lapply(data_list, function(x) sumreport(x$KIDNEY))),
  "Dyslipidemia" = unlist(lapply(data_list, function(x) sumreport(x$LIPIDS))),
  "Stroke" = unlist(lapply(data_list, function(x) sumreport(x$STROKE))),
  "Alcoholic Drinks / Week" = unlist(lapply(data_list, function(x) report(x$TOTALALC))),
  "Sugar-Sweetened Drinks / Week" = unlist(lapply(data_list, function(x) report(x$SUGDRINK))),
  "Current Smoker" = unlist(lapply(data_list, function(x) sumreport(x$CURSMOKE))),
  "Family History of Gout" = unlist(lapply(data_list, function(x) sumreport(x$FAMGOUT))),
  "No. Relatives w/ Gout" = unlist(lapply(data_list, function(x) report(x$FAMGOUTNUM)))
)

table1 <- transpose_df(table1)

# Export while the row labels are still an ordinary column: readr never writes
# row names, so writing after column_to_rownames() would drop the variable
# labels from the CSV.
write_csv(table1, file = here("Output/Tables/demographics.csv"))

table1 <- table1 %>%
  column_to_rownames(var = "Cohort")

# Display only: turn the first space of every cell and row label into a
# non-breaking space so that e.g. "59.9 ± 7.0" is not wrapped inside the HTML
# table. Written as an explicit escape so the character is visible in source.
table1 <- table1 %>%
  mutate(across(
    .cols = everything(),
    ~ str_replace(string = .x, pattern = " ", replacement = "\u00a0")
  ))
row.names(table1) <- str_replace(row.names(table1), " ", "\u00a0")

table1 %>%
  kable(
    col.names = clean_names,
    align = "c",
    escape = FALSE
  ) %>%
  kable_styling("striped") %>%
  scroll_box(width = "900px", height = "475px") %>%
  footnote("Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).")
|
|
UK Biobank Gout Male
|
UK Biobank Gout Female
|
UK Biobank Control Male
|
UK Biobank Control Female
|
Aus/NZ European Gout Male
|
Aus/NZ European Gout Female
|
Aus/NZ European Control Male
|
Aus/NZ European Control Female
|
GlobalGout Gout Male
|
GlobalGout Gout Female
|
GlobalGout Control Male
|
GlobalGout Control Female
|
Ardea LASSO Gout Male
|
Ardea LASSO Gout Female
|
Ardea CLEAR1 Gout Male
|
Ardea CLEAR1 Gout Female
|
Ardea CLEAR2 Gout Male
|
Ardea CLEAR2 Gout Female
|
Ardea CRYSTAL Gout Male
|
Ardea CRYSTAL Gout Female
|
Ardea LIGHT Gout Male
|
Ardea LIGHT Gout Female
|
East Polynesian Gout Male
|
East Polynesian Gout Female
|
East Polynesian Control Male
|
East Polynesian Control Female
|
East Polynesian Gout Male NP
|
East Polynesian Gout Female NP
|
East Polynesian Control Male NP
|
East Polynesian Control Female NP
|
West Polynesian Gout Male
|
West Polynesian Gout Female
|
West Polynesian Control Male
|
West Polynesian Control Female
|
|
N
|
7930
|
945
|
167249
|
202100
|
978
|
210
|
764
|
619
|
1032
|
124
|
44
|
79
|
819
|
65
|
230
|
16
|
239
|
7
|
175
|
4
|
109
|
8
|
408
|
122
|
241
|
382
|
124
|
28
|
44
|
37
|
436
|
54
|
199
|
180
|
|
Age at Collection (years)
|
59.9 ± 7.0
|
61.9 ± 6.1
|
57.0 ± 8.1
|
56.7 ± 7.9
|
62.4 ± 12.4
|
70.0 ± 12.7
|
54.9 ± 17.1
|
51.3 ± 17.3
|
60.1 ± 13.1
|
67.6 ± 11.1
|
60.0 ± 14.8
|
64.2 ± 11.5
|
51.4 ± 11.8
|
60.7 ± 10.6
|
52.3 ± 11.1
|
61.4 ± 7.4
|
53.0 ± 10.8
|
55.0 ± 16.0
|
53.9 ± 11.0
|
63.8 ± 5.4
|
53.3 ± 11.8
|
64.0 ± 16.1
|
54.3 ± 12.4
|
60.7 ± 11.7
|
43.9 ± 15.6
|
45.6 ± 14.8
|
59.7 ± 11.3
|
59.1 ± 13.3
|
49.8 ± 13.5
|
48.7 ± 17.1
|
47.5 ± 12.3
|
53.4 ± 13.4
|
39.3 ± 15.0
|
40.5 ± 15.4
|
|
Serum Urate (mg/dL)
|
6.7 ± 1.7
|
6.0 ± 2.0
|
5.9 ± 1.2
|
4.5 ± 1.1
|
6.7 ± 1.9
|
6.4 ± 2.3
|
5.5 ± 2.8
|
3.3 ± 2.6
|
7.4 ± 2.3
|
7.7 ± 2.6
|
6.6 ± 1.7
|
6.6 ± 1.7
|
8.9 ± 1.3
|
8.9 ± 1.4
|
7.9 ± 1.4
|
8.1 ± 1.2
|
7.9 ± 1.5
|
8.4 ± 2.0
|
8.8 ± 1.5
|
10.1 ± 1.2
|
9.3 ± 1.7
|
8.1 ± 1.4
|
7.0 ± 2.3
|
6.3 ± 2.5
|
6.5 ± 1.9
|
5.4 ± 1.7
|
7.0 ± 1.7
|
6.9 ± 2.4
|
6.3 ± 1.3
|
5.3 ± 1.3
|
7.7 ± 2.1
|
7.0 ± 2.7
|
6.6 ± 1.8
|
5.3 ± 1.7
|
|
ULT
|
4505 (56.8)
|
372 (39.4)
|
NA
|
NA
|
564 (99.6)
|
108 (98.2)
|
NA
|
NA
|
570 (75.7)
|
49 (59.8)
|
NA
|
NA
|
255 (31.2)
|
26 (40.6)
|
230 (100.0)
|
16 (100.0)
|
239 (100.0)
|
7 (100.0)
|
94 (100.0)
|
1 (100.0)
|
109 (100.0)
|
8 (100.0)
|
262 (92.6)
|
86 (96.6)
|
NA
|
NA
|
96 (93.2)
|
19 (73.1)
|
NA
|
NA
|
292 (94.2)
|
38 (95.0)
|
NA
|
NA
|
|
Age at Onset (years)
|
NA
|
NA
|
NA
|
NA
|
46.4 ± 15.8
|
59.5 ± 15.7
|
NA
|
NA
|
46.5 ± 14.0
|
57.8 ± 12.5
|
NA
|
NA
|
41.4 ± 13.4
|
55.1 ± 12.0
|
41.9 ± 12.4
|
55.2 ± 11.2
|
42.6 ± 13.2
|
46.1 ± 20.6
|
40.1 ± 13.0
|
61.5 ± 6.2
|
42.4 ± 13.1
|
55.2 ± 17.3
|
37.9 ± 14.0
|
49.4 ± 15.4
|
NA
|
NA
|
39.1 ± 15.2
|
46.0 ± 16.8
|
NA
|
NA
|
34.6 ± 12.0
|
44.3 ± 15.0
|
NA
|
NA
|
|
Disease Duration (years)
|
NA
|
NA
|
NA
|
NA
|
16.8 ± 12.7
|
10.9 ± 10.4
|
NA
|
NA
|
14.5 ± 11.4
|
10.6 ± 9.8
|
NA
|
NA
|
11.0 ± 9.4
|
6.6 ± 8.0
|
11.4 ± 9.4
|
7.1 ± 9.5
|
11.4 ± 9.8
|
9.9 ± 11.6
|
14.8 ± 10.0
|
3.2 ± 1.0
|
11.9 ± 8.7
|
9.8 ± 11.0
|
17.2 ± 12.8
|
13.1 ± 13.2
|
NA
|
NA
|
21.7 ± 15.3
|
14.1 ± 12.6
|
NA
|
NA
|
13.6 ± 10.3
|
9.2 ± 9.2
|
NA
|
NA
|
|
Number of Flares in Last Year
|
NA
|
NA
|
NA
|
NA
|
2 (0 - 4)
|
1.5 (0 - 3.25)
|
NA
|
NA
|
2 (1 - 4)
|
2.5 (1 - 4)
|
NA
|
NA
|
4 (3 - 8)
|
3 (3 - 6)
|
3 (2 - 6)
|
3 (3 - 4)
|
4 (2 - 8)
|
5 (3 - 6)
|
4 (3 - 6)
|
4.5 (2.25 - 6)
|
4 (2 - 10)
|
4 (2.75 - 5.25)
|
3 (1 - 6)
|
2 (0 - 5)
|
NA
|
NA
|
2 (0 - 3)
|
3 (1 - 6)
|
NA
|
NA
|
4 (2 - 10)
|
2 (1 - 5)
|
NA
|
NA
|
|
Presence of Tophi
|
NA
|
NA
|
NA
|
NA
|
333 (43.4)
|
67 (39.9)
|
NA
|
NA
|
320 (57.6)
|
46 (62.2)
|
NA
|
NA
|
138 (16.8)
|
5 (7.7)
|
34 (14.9)
|
1 (6.2)
|
54 (22.6)
|
2 (28.6)
|
174 (99.4)
|
4 (100.0)
|
26 (23.9)
|
5 (62.5)
|
144 (41.3)
|
26 (28.3)
|
NA
|
NA
|
9 (12.2)
|
4 (19.0)
|
NA
|
NA
|
177 (44.6)
|
14 (28.6)
|
NA
|
NA
|
|
PRS
|
4.0 ± 0.6
|
4.0 ± 0.7
|
3.7 ± 0.6
|
3.7 ± 0.6
|
4.1 ± 0.7
|
4.0 ± 0.6
|
3.7 ± 0.6
|
3.7 ± 0.6
|
4.0 ± 0.6
|
4.0 ± 0.6
|
3.8 ± 0.6
|
3.8 ± 0.6
|
4.1 ± 0.7
|
4.1 ± 0.6
|
4.2 ± 0.7
|
4.3 ± 0.6
|
4.2 ± 0.6
|
4.3 ± 0.8
|
4.2 ± 0.6
|
4.0 ± 0.4
|
4.1 ± 0.6
|
4.2 ± 0.2
|
4.4 ± 0.5
|
4.4 ± 0.5
|
4.2 ± 0.4
|
4.2 ± 0.5
|
4.2 ± 0.5
|
4.4 ± 0.5
|
4.2 ± 0.5
|
4.1 ± 0.6
|
4.8 ± 0.6
|
4.7 ± 0.6
|
4.3 ± 0.6
|
4.3 ± 0.6
|
|
Prophylaxis
|
NA
|
NA
|
NA
|
NA
|
56 (93.3)
|
13 (92.9)
|
0 (0.0)
|
NA
|
443 (62.0)
|
55 (67.9)
|
NA
|
NA
|
810 (98.9)
|
65 (100.0)
|
230 (100.0)
|
16 (100.0)
|
239 (100.0)
|
7 (100.0)
|
175 (100.0)
|
4 (100.0)
|
109 (100.0)
|
8 (100.0)
|
59 (98.3)
|
13 (92.9)
|
1 (100.0)
|
NA
|
112 (90.3)
|
23 (88.5)
|
1 (100.0)
|
NA
|
46 (100.0)
|
7 (77.8)
|
NA
|
0 (0.0)
|
|
BMI
|
30.5 ± 4.8
|
32.3 ± 6.6
|
27.7 ± 4.2
|
27.0 ± 5.1
|
30.3 ± 5.2
|
30.9 ± 7.3
|
27.2 ± 4.7
|
27.0 ± 6.2
|
29.4 ± 4.7
|
30.9 ± 6.7
|
NA
|
NA
|
34.1 ± 6.7
|
38.0 ± 10.3
|
34.6 ± 6.1
|
38.1 ± 6.5
|
33.7 ± 6.0
|
36.2 ± 7.5
|
32.2 ± 5.4
|
36.5 ± 3.8
|
31.3 ± 4.9
|
35.7 ± 8.7
|
35.4 ± 8.0
|
38.2 ± 9.8
|
31.9 ± 7.1
|
32.7 ± 8.5
|
35.9 ± 7.7
|
39.5 ± 7.6
|
32.5 ± 7.8
|
29.1 ± 6.1
|
36.1 ± 6.7
|
38.5 ± 9.1
|
33.1 ± 6.2
|
34.3 ± 7.7
|
|
Hypertension
|
5552 (70.0)
|
751 (79.5)
|
66308 (39.6)
|
61099 (30.2)
|
573 (96.3)
|
172 (98.9)
|
169 (60.8)
|
121 (40.7)
|
662 (99.3)
|
89 (100.0)
|
NA
|
NA
|
401 (49.0)
|
48 (73.8)
|
146 (63.8)
|
15 (93.8)
|
166 (69.5)
|
6 (85.7)
|
101 (57.7)
|
4 (100.0)
|
57 (52.3)
|
8 (100.0)
|
267 (92.7)
|
107 (99.1)
|
69 (98.6)
|
119 (91.5)
|
94 (100.0)
|
25 (100.0)
|
12 (100.0)
|
9 (100.0)
|
212 (88.3)
|
39 (95.1)
|
36 (87.8)
|
43 (95.6)
|
|
Type 2 Diabetes
|
1431 (20.1)
|
227 (26.7)
|
13115 (8.8)
|
8479 (4.8)
|
144 (15.9)
|
51 (26.4)
|
55 (12.2)
|
48 (13.6)
|
350 (41.1)
|
53 (55.8)
|
NA
|
NA
|
79 (9.6)
|
16 (24.6)
|
30 (13.0)
|
6 (37.5)
|
32 (13.4)
|
0 (0.0)
|
24 (13.7)
|
2 (50.0)
|
12 (11.0)
|
0 (0.0)
|
121 (30.5)
|
61 (50.4)
|
53 (22.8)
|
77 (21.2)
|
38 (100.0)
|
10 (100.0)
|
7 (100.0)
|
1 (100.0)
|
80 (19.0)
|
26 (48.1)
|
33 (17.1)
|
44 (25.6)
|
|
Heart Disease
|
2144 (27.0)
|
289 (30.6)
|
23067 (13.8)
|
12355 (6.1)
|
320 (41.6)
|
92 (51.7)
|
84 (21.8)
|
37 (11.3)
|
159 (30.9)
|
39 (54.2)
|
NA
|
NA
|
40 (4.9)
|
3 (4.6)
|
16 (7.0)
|
0 (0.0)
|
25 (10.5)
|
0 (0.0)
|
17 (9.7)
|
0 (0.0)
|
5 (4.6)
|
0 (0.0)
|
139 (38.3)
|
65 (58.6)
|
45 (21.1)
|
47 (14.6)
|
40 (51.3)
|
13 (61.9)
|
4 (26.7)
|
2 (15.4)
|
77 (19.2)
|
17 (34.0)
|
12 (7.8)
|
18 (10.8)
|
|
Kidney Disease
|
1079 (14.3)
|
276 (30.7)
|
5574 (3.5)
|
8907 (4.6)
|
359 (47.5)
|
122 (73.1)
|
214 (56.2)
|
251 (70.1)
|
238 (45.8)
|
64 (80.0)
|
NA
|
NA
|
138 (17.0)
|
28 (43.1)
|
28 (12.3)
|
9 (56.2)
|
38 (15.9)
|
4 (57.1)
|
31 (17.7)
|
2 (50.0)
|
13 (11.9)
|
4 (50.0)
|
153 (46.5)
|
71 (73.2)
|
34 (68.0)
|
57 (77.0)
|
28 (93.3)
|
12 (100.0)
|
4 (66.7)
|
8 (100.0)
|
130 (35.8)
|
32 (71.1)
|
21 (53.8)
|
30 (76.9)
|
|
Dyslipidemia
|
4064 (51.2)
|
503 (53.2)
|
47606 (28.5)
|
34238 (16.9)
|
494 (82.1)
|
117 (84.2)
|
169 (51.8)
|
143 (35.3)
|
563 (72.5)
|
71 (79.8)
|
NA
|
NA
|
331 (40.4)
|
37 (56.9)
|
110 (47.8)
|
13 (81.2)
|
98 (41.0)
|
4 (57.1)
|
78 (44.8)
|
3 (75.0)
|
41 (37.6)
|
5 (62.5)
|
235 (87.4)
|
77 (92.8)
|
56 (80.0)
|
90 (79.6)
|
71 (100.0)
|
21 (100.0)
|
16 (100.0)
|
8 (100.0)
|
238 (83.2)
|
36 (90.0)
|
38 (73.1)
|
37 (56.9)
|
|
Stroke
|
641 (8.1)
|
101 (10.7)
|
6727 (4.0)
|
4942 (2.4)
|
48 (7.1)
|
16 (10.8)
|
138 (27.8)
|
214 (40.5)
|
41 (9.6)
|
8 (15.7)
|
NA
|
NA
|
7 (0.9)
|
1 (1.5)
|
3 (1.3)
|
0 (0.0)
|
1 (0.4)
|
0 (0.0)
|
3 (1.7)
|
0 (0.0)
|
0 (0.0)
|
0 (0.0)
|
24 (7.0)
|
13 (12.3)
|
12 (5.7)
|
19 (5.7)
|
2 (3.4)
|
1 (5.9)
|
0 (0.0)
|
0 (0.0)
|
12 (3.1)
|
5 (10.2)
|
4 (2.6)
|
3 (1.8)
|
|
Alcoholic Drinks / Week
|
7.6 ± 5.4
|
6.0 ± 5.1
|
6.6 ± 5.2
|
5.8 ± 5.0
|
7.8 ± 10.5
|
2.4 ± 5.1
|
4.8 ± 9.6
|
2.7 ± 4.2
|
14.1 ± 19.2
|
4.4 ± 7.5
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
5.5 ± 14.3
|
1.9 ± 7.2
|
5.3 ± 10.9
|
2.4 ± 5.8
|
5.9 ± 8.6
|
2.2 ± 4.4
|
3.2 ± 6.0
|
3.8 ± 7.6
|
4.2 ± 9.0
|
0.9 ± 2.7
|
4.3 ± 11.0
|
1.2 ± 3.7
|
|
Sugar-Sweetened Drinks / Week
|
NA
|
NA
|
NA
|
NA
|
1.0 ± 1.5
|
0.6 ± 1.1
|
0.9 ± 1.3
|
0.5 ± 1.1
|
0.8 ± 1.3
|
0.7 ± 1.2
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
1.7 ± 1.9
|
1.0 ± 1.5
|
1.8 ± 2.5
|
1.2 ± 1.6
|
1.3 ± 1.7
|
0.8 ± 1.3
|
2.1 ± 2.2
|
1.4 ± 2.0
|
2.3 ± 2.2
|
1.4 ± 1.5
|
2.0 ± 1.7
|
1.4 ± 1.5
|
|
Current Smoker
|
482 (6.1)
|
74 (7.8)
|
14708 (8.8)
|
13750 (6.8)
|
22 (4.3)
|
5 (5.8)
|
26 (5.2)
|
20 (4.2)
|
94 (16.5)
|
14 (18.2)
|
NA
|
NA
|
NA
|
NA
|
40 (17.5)
|
0 (0.0)
|
30 (12.6)
|
0 (0.0)
|
35 (20.0)
|
0 (0.0)
|
19 (17.9)
|
0 (0.0)
|
35 (17.8)
|
4 (7.5)
|
41 (29.1)
|
45 (23.4)
|
16 (23.9)
|
3 (15.8)
|
4 (26.7)
|
4 (33.3)
|
12 (9.7)
|
2 (6.1)
|
23 (21.5)
|
15 (13.5)
|
|
Family History of Gout
|
NA
|
NA
|
NA
|
NA
|
402 (44.3)
|
89 (47.1)
|
58 (16.3)
|
76 (23.7)
|
270 (34.3)
|
40 (38.1)
|
3 (33.3)
|
2 (25.0)
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
239 (67.9)
|
80 (74.8)
|
87 (43.7)
|
148 (45.7)
|
79 (68.1)
|
22 (84.6)
|
15 (36.6)
|
19 (57.6)
|
237 (62.0)
|
32 (61.5)
|
66 (38.2)
|
61 (38.9)
|
|
No. Relatives w/ Gout
|
NA
|
NA
|
NA
|
NA
|
0.8 ± 1.0
|
1.0 ± 1.3
|
0.3 ± 0.6
|
0.5 ± 0.7
|
0.7 ± 0.8
|
0.9 ± 0.8
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
1.7 ± 2.2
|
1.8 ± 1.5
|
0.8 ± 1.3
|
0.7 ± 1.0
|
2.0 ± 1.9
|
2.2 ± 1.5
|
0.8 ± 1.0
|
1.3 ± 1.4
|
1.5 ± 1.9
|
1.5 ± 1.9
|
0.6 ± 0.9
|
0.6 ± 0.9
|
|
Note:
|
|
Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).
|
Making table of model results.